import random
import re
import time

import pymysql
import requests
from lxml import etree

# Suppress the InsecureRequestWarning triggered by verify=False requests.
requests.packages.urllib3.disable_warnings()


class Jobui:
    def __init__(self):
        self.url = "https://www.jobui.com/cmp?area=%E5%85%A8%E5%9B%BD&keyword="
        self.base_url = "https://www.jobui.com/cmp?" \
                        "area=%E5%85%A8%E5%9B%BD&industry={}&worker={}&impression={}&type={}&n={}"
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,"
                      "application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Cookie": "jobui_p=1565753151227_21067661; "
                      "jobui_area=%25E6%25B7%25B1%25E5%259C%25B3; "
                      "Hm_lvt_8b3e2b14eff57d444737b5e71d065e72=1565753152,1567047709,1567585344; "
                      "PHPSESSID=kkdnm8jingh5vq1g7e1ora7pe3; "
                      "jobui_img_logo=vbBZkTB2kbhlgdb8yFiTPdmw4wCW3uKOYJ%2F4lauoW4o%3D; "
                      "TN_VisitCookie=42; TN_VisitNum=33; Hm_lpvt_8b3e2b14eff57d444737b5e71d065e72=1567585986",
            "Host": "www.jobui.com",
            "Pragma": "no-cache",
            "Referer": "https://www.jobui.com/cmp",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
        }
        self.u = Util()
        self.cursor = self.u.MySQL().cursor()
        self.data = {"area": "全国", "keyword": ""}
        self.base_data = {
            "area": "全国",
            "industry": "",
            "worker": "",
            "impression": "",
            "type": ""
        }
        self.re_try_list = []
        self.proxies = self.get_proxy()

    def get_proxy(self):
        # Load the verified proxies from the ip_pool table.
        sql = "select ip, tp from ip_pool where tof = '1';"
        self.cursor.execute(sql)
        return {ip: tp for ip, tp in self.cursor.fetchall()}

    def pick_proxy(self):
        # Pick one random entry from the pool in the mapping form requests expects.
        proxy_key = random.choice(list(self.proxies.keys()))
        print("<{}>".format(proxy_key))
        return {proxy_key: self.proxies[proxy_key]}

    def handle_data(self, req):
        if req.status_code != 200:
            print("Request failed, status code: {}".format(req.status_code))
            return
        html = self.u.get_xpath_obj(req.text)
        if html.xpath("//div[@class=\"no-result\"]"):
            print(">>>>> No results on this page")
            return
        urls = ["https://www.jobui.com" + i
                for i in html.xpath("//div[@class=\"company-segmetation\"]/a/@href")]
        for url in urls:
            print(url)
            try:
                detail_req = requests.get(url=url, headers=self.headers,
                                          proxies=self.pick_proxy(), verify=False)
            except requests.exceptions.ConnectionError:
                self.re_try_list.append(url)
                print("Fetch failed; URL added to the retry list.")
                continue
            print("Detail page fetched, status code: {}".format(detail_req.status_code))
            detail_html = self.u.get_xpath_obj(detail_req.text)
            # Defaults cover pages whose intro block has an unexpected layout.
            title = brief_intro = xingzhi = guimo = hangye = rongzi = quancheng = intro = ""
            dt_count = len(detail_html.xpath("//div[@class=\"intro\"]/div/dl/dt"))
            if dt_count in (3, 4):
                # The 3-row and 4-row layouts are identical except that a fourth
                # <dt> row carries the funding information.
                title = detail_html.xpath("//h1/a/text()")[0].strip()
                if detail_html.xpath("//div[@class=\"company-banner-segmetation\"]/p/text()"):
                    brief_intro = detail_html.xpath(
                        "//div[@class=\"company-banner-segmetation\"]/p/text()")[0].strip()
                xingzhi, guimo = detail_html.xpath(
                    "//div[@class=\"intro\"]/div/dl/dd[1]/text()")[0].split(" / ")
                hangye = ";".join(
                    i.strip() for i in
                    detail_html.xpath("//div[@class=\"intro\"]/div/dl/dd[2]/a/text()"))
                if dt_count == 4:
                    rongzi = detail_html.xpath(
                        "//div[@class=\"intro\"]/div/dl/dd/dd[@class=\"gray3\"]/text()")[0].strip()
                quancheng = detail_html.xpath(
                    "//div[@class=\"intro\"]/div/dl/dd[@class=\"gray3\"]/text()")[0].strip()
                intro = "".join(detail_html.xpath("//*[@id=\"textShowMore\"]/text()")).strip()
            id_code = self.u.MD5(quancheng)
            crawl_time = self.u.get_now_time()
            sql = ("insert into tmp_jobui(id, title, brief_intro, xingzhi, guimo, "
                   "hangye, rongzi, quancheng, intro, crawl_time) "
                   "values('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')"
                   % (id_code, title, brief_intro, xingzhi, guimo, hangye, rongzi,
                      quancheng, pymysql.converters.escape_string(intro), crawl_time))
            self.u.insert2mysql(title, sql)
            print("-" * 100)

    def re_try(self):
        # Iterate over a snapshot so URLs that fail again are re-queued for a
        # later pass instead of growing the list mid-iteration.
        for rt in list(self.re_try_list):
            industry = re.findall(r'industry=(.*?)&', rt)[0]
            worker = re.findall(r'worker=(.*?)&', rt)[0]
            impression = re.findall(r'impression=(.*?)&', rt)[0]
            cmp_type = re.findall(r'type=(.*?)&', rt)[0]
            # n is the last query parameter, so match digits; a lazy (.*?) with
            # nothing after it only ever captures an empty string.
            n = re.findall(r'n=(\d+)', rt)[0]
            self.base_data["industry"] = industry
            self.base_data["worker"] = worker
            self.base_data["impression"] = impression
            self.base_data["type"] = cmp_type
            self.base_data["n"] = n
            try:
                r = requests.get(url=rt, headers=self.headers,
                                 data=self.base_data, proxies=self.pick_proxy())
                self.handle_data(r)
            except requests.exceptions.ConnectionError:
                self.re_try_list.append(rt)
                continue

    def main(self):
        try:
            res = requests.get(url=self.url, headers=self.headers, data=self.data,
                               proxies=self.pick_proxy(), verify=False)
            print("Status code: " + str(res.status_code))
        except Exception as e:
            print("Request error: " + str(e))
            time.sleep(300)
            res = requests.get(url=self.url, headers=self.headers, data=self.data,
                               proxies=self.pick_proxy(), verify=False)
        if res.status_code == 200:
            html = self.u.get_xpath_obj(res.text)
            # Filter option lists from the search page; the first entry of each
            # list is the catch-all option, hence the [1:] slices below.
            hangye = html.xpath("//div[@class=\"job-select-box\"]/ul/li[1]/div/div/a/text()")
            xingzhi = html.xpath("//div[@class=\"job-select-box\"]/ul/li[2]/div/div/a/text()")
            guimo = html.xpath("//div[@class=\"job-select-box\"]/ul/li[3]/div/div/a/text()")
            tese = html.xpath("//div[@class=\"job-select-box\"]/ul/li[4]/div/div/a/text()")
            for a in hangye[1:]:
                for b in xingzhi[1:]:
                    for c in guimo[1:]:
                        for d in tese[1:]:
                            for i in range(1, 51):
                                # Build the request URL for this filter combination.
                                print("Building request URL")
                                use_url = self.base_url.format(
                                    self.u.url_encode(a),
                                    self.u.url_encode(c),
                                    self.u.url_encode(d),
                                    self.u.url_encode(b),
                                    i)
                                # Keep the request parameters in step with the URL.
                                self.base_data["industry"] = a
                                self.base_data["worker"] = c
                                self.base_data["impression"] = d
                                self.base_data["type"] = b
                                try:
                                    r = requests.get(url=use_url, headers=self.headers,
                                                     data=self.base_data,
                                                     proxies=self.pick_proxy())
                                except requests.exceptions.ConnectionError:
                                    self.re_try_list.append(use_url)
                                    continue
                                self.handle_data(r)
            self.re_try()
        elif res.status_code == 403:
            print("403 Forbidden")
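# ---------------------------------------------------------------------------
# The shared Util helper is not part of this listing. The sketch below is a
# minimal, assumed implementation inferred from the call sites in this file
# (MySQL, get_req, get_xpath_obj, MD5, get_now_time, url_encode, insert2mysql);
# the connection settings are placeholders, not the project's real values.
# ---------------------------------------------------------------------------
import hashlib
from urllib.parse import quote


class Util:
    def MySQL(self):
        # Placeholder credentials -- substitute the real connection settings.
        return pymysql.connect(host="127.0.0.1", user="root", password="password",
                               database="spider", charset="utf8mb4")

    def get_req(self, url, headers):
        # Thin wrapper over requests.get, as used by Qlm_zbbg below.
        return requests.get(url=url, headers=headers)

    def get_xpath_obj(self, text):
        # Parse an HTML string into an lxml tree for XPath queries.
        return etree.HTML(text)

    def MD5(self, s):
        # Hex MD5 digest of a string; the spiders use it as a row ID.
        return hashlib.md5(s.encode("utf-8")).hexdigest()

    def get_now_time(self):
        return time.strftime("%Y-%m-%d %H:%M:%S")

    def url_encode(self, s):
        # Percent-encode one query-string component (e.g. "全国" -> "%E5%85%A8%E5%9B%BD").
        return quote(s)

    def insert2mysql(self, name, sql, up_sql=None):
        # Run an INSERT (and an optional follow-up UPDATE); skip duplicate rows.
        conn = self.MySQL()
        try:
            with conn.cursor() as cursor:
                cursor.execute(sql)
                if up_sql:
                    cursor.execute(up_sql)
            conn.commit()
            print("{}: inserted".format(name))
        except pymysql.err.IntegrityError:
            conn.rollback()
            print("{}: duplicate row, skipped".format(name))
        finally:
            conn.close()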
class Qlm_zbbg:
    def __init__(self):
        self.base_url = "http://www.qianlima.com/zbbg/p{}"
        self.page = 200
        self.util = Util()
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Cookie": "__jsluid_h=144847f002c5e67a5b7bf1888f49e19c; UM_distinctid=16c02c0e9b53d5-083f7603340745-e343166-144000-16c02c0e9b6403; gr_user_id=bfb0c075-bcf5-4e05-a943-8b3448f39a0d; Hm_lvt_0a38bdb0467f2ce847386f381ff6c0e8=1563432734; LXB_REFER=www.baidu.com; bridgeid=59454367; keywordUnit=40461; keywords=%E5%8D%83%E9%87%8C%E9%A9%AC%E6%8B%9B%E6%A0%87%E7%BD%91; CNZZDATA1277608403=172402465-1563412202-%7C1563498692; BAIDU_SSP_lcr=https://www.baidu.com/link?url=BUcmE5CDcuTFAv7tI05xeq_80sbO-X-vNsQ1yhUvF_DGdoPt-o7VQs8t7AYRpXBm&wd=&eqid=da58e9c4000e34dc000000065d312603; Hm_lvt_5dc1b78c0ab996bd6536c3a37f9ceda7=1563414294,1563432734,1563432760,1563502122; qlm_old=\"http://www.qianlima.com/zb/detail/20190719_139475196.html\"; Hm_lpvt_0a38bdb0467f2ce847386f381ff6c0e8=1563502180; qlm_username=15561585051; qlm_password=RCf8ujm8K3EfguKmBCouKpgCKK7uopgU; rem_login=1; qlmll_his=\",139475750,139491436,139497668,139475763,139475196,139264733,139264636,139269995,\"; seo_refUrl=\"http://www.directlyaccess.com\"; seo_curUrl=\"http://www.qianlima.com/common/cat.jsp\"; CNZZDATA1848524=cnzz_eid%3D430053542-1563409337-%26ntime%3D1563503598; fromWhereUrl=\"http://www.qianlima.com/zbbg/\"; seo_intime=\"2019-07-19 10:57:07\"; Hm_lpvt_5dc1b78c0ab996bd6536c3a37f9ceda7=1563506743",
            "Host": "www.qianlima.com",
            "Referer": "http://www.qianlima.com/",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        }

    def get_url_mysql(self):
        # Walk the list pages and queue every detail URL with status = 0.
        for i in range(self.page):
            url = self.base_url.format(i)
            res = self.util.get_req(url=url, headers=self.headers)
            html = self.util.get_xpath_obj(res.text)
            for dl in html.xpath("//div[@class=\"sevenday_list\"]/dl"):
                detail_url = dl.xpath("./dt/a/@href")[0].strip()
                sql = "insert into qlm_zbbg_url(url, status) values ('%s', '0')" % detail_url
                self.util.insert2mysql(detail_url, sql)
        self.util.MySQL().close()

    def get_mess(self):
        conn = self.util.MySQL()
        cursor = conn.cursor()
        cursor.execute("select url from qlm_zbbg_url where status = 0;")
        for detail_url in cursor.fetchall():
            print(detail_url[0])
            # Fetch once and reuse the body for both the XPath parse and the
            # raw regex extraction of the announcement text.
            detail_text = self.util.get_req(url=detail_url[0], headers=self.headers).text
            detail_html = self.util.get_xpath_obj(detail_text)
            try:
                detail_title = detail_html.xpath("//h2/text()")[0]
                detail_location = "".join(detail_html.xpath("//span[@class=\"site\"]/a//text()"))
                detail_status = detail_html.xpath(
                    "//span[@class=\"zhuangtai\"]//text()")[0].replace("状态:", "")
                detail_date = detail_html.xpath("//span[@class=\"d2\"]/text()")[0]
                detail_content = re.findall(r'<div id="wen".*?</div>', detail_text, re.S)[0] \
                    .replace("\"", "\\\"").replace("\'", "\\\'")
                record_id = self.util.MD5(detail_title + detail_location)
                crawl_time = self.util.get_now_time()
                # Column order matches the value tuple: STS <- status,
                # ISU_TM <- publication date, INVT_PUB_BID_CNTNT <- content.
                sql = ("insert into INVT_PUB_BID_MDF_INF"
                       "(ID, TTL, ZON, STS, ISU_TM, INVT_PUB_BID_CNTNT, DTL_LINK, INPT_DT) "
                       "values('%s','%s','%s','%s','%s','%s','%s','%s')"
                       % (record_id, detail_title, detail_location, detail_status,
                          detail_date, detail_content, detail_url[0], crawl_time))
                up_sql = "update qlm_zbbg_url set status = 1 where url = '{}';".format(detail_url[0])
                self.util.insert2mysql(detail_title, sql, up_sql)
                conn.commit()
            except IndexError:
                print("Detail page parsing failed")

    def run(self):
        self.get_url_mysql()
        self.get_mess()


if __name__ == "__main__":
    q = Qlm_zbbg()
    q.run()
    time.sleep(86400)  # daily pacing between runs
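# ---------------------------------------------------------------------------
# Assumed layout of the qlm_zbbg_url staging table used above; the real DDL is
# not shown in this listing. get_url_mysql fills it with status = 0 and
# get_mess flips rows to status = 1 once parsed. A bootstrap sketch:
# ---------------------------------------------------------------------------
def create_qlm_url_queue(conn):
    ddl = """
        CREATE TABLE IF NOT EXISTS qlm_zbbg_url (
            url    VARCHAR(512) NOT NULL PRIMARY KEY,  -- detail-page URL
            status TINYINT      NOT NULL DEFAULT 0     -- 0 = pending, 1 = parsed
        )
    """
    with conn.cursor() as cursor:
        cursor.execute(ddl)
    conn.commit()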
class FemhzsMofcomGov:
    def __init__(self):
        self.base_url = "http://femhzs.mofcom.gov.cn/fecpmvc/pages/fem/CorpJWList_nav.pageNoLink.html?" \
                        "session=T&sp={}&sp=S+_t1.CORP_CDE%2C+_t1.id&sp=T&sp=S"
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,"
                      "application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Cookie": "JSESSIONID=ACBDC30A40FD783627A075ADB9440B4D; insert_cookie=56224592",
            "Host": "femhzs.mofcom.gov.cn",
            "Referer": "http://femhzs.mofcom.gov.cn/fecpmvc/pages/fem/CorpJWList.html",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/75.0.3770.100 Safari/537.36",
        }
        self.f_headers = {
            "Host": "femhzs.mofcom.gov.cn",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Referer": "http://www.mofcom.gov.cn/publicService.shtml",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9"
        }
        self.util = Util()
        self.conn = self.util.MySQL()
        self.page = 0

    def insert2mysql(self, sql):
        try:
            self.conn.cursor().execute(sql)
            self.conn.commit()
            print("Inserted")
        except pymysql.err.IntegrityError:
            print("Insert skipped: duplicate row")
            self.conn.rollback()
        except pymysql.err.ProgrammingError:
            print("Bad data, rolled back")
            self.conn.rollback()

    def run(self):
        entry_url = "http://femhzs.mofcom.gov.cn/fecpmvc/pages/fem/CorpJWList.html"
        try:
            first_req = requests.get(url=entry_url, headers=self.f_headers, timeout=15)
        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError):
            # Network hiccup: wait, rotate the UA, and try the entry page again.
            # User_Agent is assumed to be a UA pool defined elsewhere in the project.
            time.sleep(10)
            self.f_headers["User-Agent"] = random.choice(User_Agent)
            first_req = requests.get(url=entry_url, headers=self.f_headers, timeout=15)
        # Reuse the session cookie issued on the entry page for the paging requests.
        cookies = first_req.headers["Set-Cookie"].replace(" Path=/fecpmvc,", "").replace("; path=/", "")
        page = etree.HTML(first_req.text).xpath("//em[@class=\"m-page-total-num\"]/text()")[0]
        print("Total pages: {}".format(page))
        for i in range(1, int(page)):
            print(i)
            # "sp" repeats in the query, so pass the parameters as 2-tuples
            # (a dict literal cannot hold duplicate keys).
            data = [
                ("session", "T"),
                ("sp", i),
                ("sp", "S _t1.CORP_CDE, _t1.id"),
                ("sp", "T"),
                ("sp", "S"),
            ]
            self.headers["Cookie"] = cookies
            url = self.base_url.format(i)
            try:
                res = requests.get(url=url, headers=self.headers, data=data, timeout=15)
            except requests.exceptions.Timeout:
                # requests raises its own Timeout, not the builtin TimeoutError.
                time.sleep(10)
                res = requests.get(url=url, headers=self.headers, data=data, timeout=15)
            time.sleep(2)
            if res.status_code == 200:
                print("Fetch succeeded, parsing")
                html = etree.HTML(res.text)
                for tr in html.xpath("//table[@class=\"m-table\"]/tbody/tr"):
                    company_name = tr.xpath("./td[1]/text()")[0].strip()
                    investor_name = tr.xpath("./td[2]/text()")[0].strip()
                    country = tr.xpath("./td[3]/text()")[0].strip()
                    # MD5 of the company name serves as the row ID.
                    md5_company = self.util.MD5(company_name)
                    # Current timestamp.
                    otherStyleTime = self.util.get_now_time()
                    sql = ("insert into EXT_INV_ENTP_LST_INF"
                           "(ID, OVS_INV_ENTP_NM, OVS_INV_NM, INV_CNR, INPT_DT) "
                           "values('%s','%s','%s','%s','%s')"
                           % (md5_company, company_name, investor_name, country, otherStyleTime))
                    self.insert2mysql(sql)
            else:
                print("Fetch failed, HTTP code: {}".format(res.status_code))
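# ---------------------------------------------------------------------------
# Minimal entry point for the mofcom spider (assumed; the original listing
# does not show how FemhzsMofcomGov is launched).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    spider = FemhzsMofcomGov()
    spider.run()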