Example 1
    def catch_offer_lists(self, html_content):
        """提取所有页面的岗位信息"""
        metree = lxml.html.etree
        # Parse the page
        offer_parser = metree.HTML(html_content)
        offer_item = []

        cp_name = offer_parser.xpath(
            "//div[@id='contact']//div[@class='about_info iconboxAll']/h3/text()"
        )[0]
        offer_item.append(cp_name.strip())

        s_name = offer_parser.xpath("//div[@class='black f16']/h1/text()")[0]
        offer_item.append(s_name.strip())

        name = offer_parser.xpath(
            "//div[@id='contact']/div[@class='ml10 mt10 clearfix']/div/text()"
        )[1]
        name = name.strip().split("(")[0]
        if name.isdigit():
            offer_item.append("未备注姓名")  # placeholder meaning "no contact name given"
        else:
            offer_item.append(name)

        # The phone number is rendered as an image; pick the matching <img> src
        p = offer_parser.xpath(
            "//div[@id='contact']/div[@class='ml10 mt10 clearfix']/p")
        if len(p) == 2:
            phone = offer_parser.xpath(
                "//div[@id='contact']/div[@class='ml10 mt10 clearfix']/p/img/@src"
            )[1]
        elif len(p) == 1:
            phone = offer_parser.xpath(
                "//div[@id='contact']/div[@class='ml10 mt10 clearfix']/p/img/@src"
            )[0]
        else:
            phone = 'http://www.cnxhyp.com/api/image.png.php?auth=8e59p7CdwkARbg9ckaSIZnCmcsmvFULBV1JKM3BJKqD8xqbcDfWwafFQ5w'

        url = phone

        headers = {
            'user-agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36',
            'Referer': 'http://www.cnxhyp.com/sell/show-2702.html'
        }
        req = requests.get(url, headers=headers, proxies=proxypool.get_proxy())
        with open("./tupian.png", 'wb') as f:  # save the phone-number image
            f.write(req.content)
        image = Image.open('./tupian.png')
        content = pytesseract.image_to_string(image)  # OCR the image
        offer_item.append(content)

        print(offer_item)
        self.offer_datas.append(offer_item)
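
The contact phone number is served as an image, so the snippet downloads it and runs pytesseract over it. Below is a minimal standalone sketch of that OCR step, decoding the image in memory instead of via a temp file; ocr_image_url is an illustrative helper, not part of the original spider:

import io

import requests
from PIL import Image
import pytesseract

def ocr_image_url(url, headers=None):
    """Download an image and return the text pytesseract reads from it."""
    rsp = requests.get(url, headers=headers, timeout=10)
    rsp.raise_for_status()
    image = Image.open(io.BytesIO(rsp.content))  # decode in memory, no temp file
    return pytesseract.image_to_string(image)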
Example 2
 def parse_offer_url(self, temp_url):
     """爬取整个页面内容"""
     offer_response = requests.get(temp_url,
                                   headers=useragentutil.get_headers(),
                                   proxies=proxypool.get_proxy())
     offer_html_content = offer_response.content.decode("utf-8")
     # Throttle the crawl rate
     wait_time = random.randint(0, 5)
     print("Throttling request rate; resuming in %ds..." % wait_time)
     time.sleep(wait_time)
     return offer_html_content
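
parse_offer_url throttles itself with a random 0-5 second sleep after every request. A sketch of the same jittered-delay idea factored into a reusable helper; fetch_politely and its parameters are illustrative assumptions, not part of the original code:

import random
import time

import requests

def fetch_politely(url, min_wait=0, max_wait=5, **kwargs):
    """GET a URL, then sleep a random interval to spread requests out."""
    response = requests.get(url, **kwargs)
    wait_time = random.randint(min_wait, max_wait)
    print("Throttling request rate; resuming in %ds..." % wait_time)
    time.sleep(wait_time)
    return response.content.decode("utf-8")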
Example 3
 def catch_work_info(self, temp_url):
     """提取工作职责信息"""
     try:
         work_response = requests.get(temp_url,
                                      headers=useragentutil.get_headers(),
                                      proxies=proxypool.get_proxy())
         work_html_content = work_response.content.decode("gbk")
         work_parser = lxml.html.etree.HTML(work_html_content)
         work_infos = "".join(
             work_parser.xpath("//div[@class='bmsg job_msg inbox']//text()")
         ).strip().replace(" ", "")  # clean the data
         # print("Job duties:", work_infos)
     except Exception:
         work_infos = "暂无数据"  # placeholder meaning "no data available"
     return work_infos
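
The response here is decoded as GBK; any byte sequence that is not valid GBK raises UnicodeDecodeError and trips the except branch, discarding the whole page. If partially damaged pages are acceptable, a more forgiving variant (errors="replace" is an assumption, not in the original code):

# Substitute undecodable bytes instead of failing the whole page
work_html_content = work_response.content.decode("gbk", errors="replace")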
Example 4
 def get_offer_pages(self):
     """动态获取页面数,int"""
     offer_page_response = requests.get(self.offer_index_url,
                                        headers=useragentutil.get_headers(),
                                        proxies=proxypool.get_proxy())
     # Fetch the page source
     page_html_content = offer_page_response.content.decode("gbk")
     # Parse the data
     metree = lxml.html.etree
     page_parser = metree.HTML(page_html_content)
     # Extract the pager label, e.g. "共55页" ("55 pages in total")
     pages_content = page_parser.xpath(
         "//div[@class='dw_page']//span[@class='td']/text()")[0]
     pages = int(re.search(r"共(\d+)页", pages_content)[1])
     return pages
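
The pager label has the form "共N页" ("N pages in total"), and the count is pulled out with a regex. A quick standalone check of that pattern; the sample string is made up:

import re

pages_content = "共55页, 到第"  # made-up sample of the pager text
pages = int(re.search(r"共(\d+)页", pages_content)[1])
print(pages)  # 55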
Example 5
 def catch_company_info(self, temp_url):
     """提取公司简介信息"""
     try:
         company_response = requests.get(
             temp_url,
             headers=useragentutil.get_headers(),
             proxies=proxypool.get_proxy())
         company_html_content = company_response.content.decode("gbk")
         # Extract the data
         company_parser = lxml.html.etree.HTML(company_html_content)
         company_infos = "".join(
             company_parser.xpath(
                 "//div[@class='con_txt']//text()")).strip().replace(
                     " ", "")
     except Exception:
         company_infos = "暂无数据"
     return company_infos
Example 6
    def catch_work_info(self, temp_url):
        """提取工作职责信息"""
        try:
            work_response = requests.get(temp_url,
                                         headers=useragentutil.get_headers(),
                                         proxies=proxypool.get_proxy())
            work_html_content = work_response.content.decode("utf-8")

            work_parser = lxml.html.etree.HTML(work_html_content)
            work_infos = work_parser.xpath(
                "//div[@class='tabs_box pllist active']//ul[@class='clearfix']/li"
            )
            for li in work_infos:
                url.append(li.xpath("./a/@href"))  # each entry is the list of hrefs under one <li>

        except Exception:
            pass  # on failure, fall through and return whatever was collected

        return url
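
Because each appended entry is itself the list of hrefs under one <li>, the returned value is a list of lists. A self-contained sketch of the same XPath extraction on inline HTML, flattening that nesting; the sample markup is invented:

import lxml.html

SAMPLE = """
<div class="tabs_box pllist active">
  <ul class="clearfix">
    <li><a href="/job/1.html">Job 1</a></li>
    <li><a href="/job/2.html">Job 2</a></li>
  </ul>
</div>
"""

parser = lxml.html.etree.HTML(SAMPLE)
items = parser.xpath(
    "//div[@class='tabs_box pllist active']//ul[@class='clearfix']/li")
links = [href for li in items for href in li.xpath("./a/@href")]
print(links)  # ['/job/1.html', '/job/2.html']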