Example #1
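This example crawls company listings from jobui.com through a rotating proxy pool. It assumes the imports below; Util is a project-local helper (supplying MySQL(), get_xpath_obj(), MD5(), url_encode(), get_now_time(), and insert2mysql()), so its import path here is a placeholder.

import random
import re
import time

import pymysql
import requests

from util import Util  # placeholder path for the project-local helper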
class Jobui:
    def __init__(self):
        self.url = "https://www.jobui.com/cmp?area=%E5%85%A8%E5%9B%BD&keyword="
        self.base_url = "https://www.jobui.com/cmp?" \
                        "area=%E5%85%A8%E5%9B%BD&industry={}&worker={}&impression={}&type={}&n={}"
        self.headers = {
            "Accept":
            "text/html,application/xhtml+xml,"
            "application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate, br",
            "Accept-Language":
            "zh-CN,zh;q=0.9",
            "Cache-Control":
            "no-cache",
            "Connection":
            "keep-alive",
            "Cookie":
            "jobui_p=1565753151227_21067661; "
            "jobui_area=%25E6%25B7%25B1%25E5%259C%25B3; "
            "Hm_lvt_8b3e2b14eff57d444737b5e71d065e72=1565753152,1567047709,1567585344; "
            "PHPSESSID=kkdnm8jingh5vq1g7e1ora7pe3; "
            "jobui_img_logo=vbBZkTB2kbhlgdb8yFiTPdmw4wCW3uKOYJ%2F4lauoW4o%3D; "
            "TN_VisitCookie=42; TN_VisitNum=33; Hm_lpvt_8b3e2b14eff57d444737b5e71d065e72=1567585986",
            "Host":
            "www.jobui.com",
            "Pragma":
            "no-cache",
            "Referer":
            "https://www.jobui.com/cmp",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
        }
        self.u = Util()
        self.cursor = self.u.MySQL().cursor()
        self.data = {"area": "全国", "keyword": ""}
        self.base_data = {
            "area": "全国",
            "industry": "",
            "worker": "",
            "impression": "",
            "type": ""
        }
        self.re_try_list = []
        self.proxies = self.get_proxy()

    def get_proxy(self):
        sql = "select ip, tp from ip_pool where tof = '1';"
        self.cursor.execute(sql)
        proxy = self.cursor.fetchall()
        proxies = {}
        for ip, tp in proxy:
            proxies[ip] = tp
        return proxies

    def handle_data(self, req):
        if req.status_code == 200:
            html = self.u.get_xpath_obj(req.text)
            if html.xpath("//div[@class=\"no-result\"]"):
                print(">>>>>页面无数据")
            else:
                urls = [
                    "https://www.jobui.com" + i for i in html.xpath(
                        "//div[@class=\"company-segmetation\"]/a/@href")
                ]
                for url in urls:
                    print(url)
                    try:
                        # Suppress redundant urllib3 insecure-request warnings
                        requests.packages.urllib3.disable_warnings()
                        proxy_key = random.choice(list(self.proxies.keys()))
                        print("<{}>".format(proxy_key))
                        proxies = {proxy_key: self.proxies[proxy_key]}
                        detail_req = requests.get(url=url,
                                                  headers=self.headers,
                                                  proxies=proxies,
                                                  verify=False)
                    except requests.exceptions.ConnectionError:
                        self.re_try_list.append(url)
                        print("网页未被请求到,已加入重试列表。")
                        continue
                    print("详情页请求完成,响应代码为:{}".format(detail_req.status_code))
                    detail_html = self.u.get_xpath_obj(detail_req.text)
                    if len(
                            detail_html.xpath(
                                "//div[@class=\"intro\"]/div/dl/dt")) == 4:
                        title = detail_html.xpath("//h1/a/text()")[0].strip()
                        if detail_html.xpath(
                                "//div[@class=\"company-banner-segmetation\"]/p/text()"
                        ):
                            brief_intro = detail_html.xpath(
                                "//div[@class=\"company-banner-segmetation\"]/p/text()"
                            )[0].strip()
                        else:
                            brief_intro = ""
                        xingzhi, guimo = detail_html.xpath(
                            "//div[@class=\"intro\"]/div/dl/dd[1]/text()"
                        )[0].split(" / ")
                        hangye = ";".join([
                            i.strip() for i in detail_html.xpath(
                                "//div[@class=\"intro\"]/div/dl/dd[2]/a/text()"
                            )
                        ])
                        rongzi = detail_html.xpath(
                            "//div[@class=\"intro\"]/div/dl/dd/dd[@class=\"gray3\"]/text()"
                        )[0].strip()
                        quancheng = detail_html.xpath(
                            "//div[@class=\"intro\"]/div/dl/dd[@class=\"gray3\"]/text()"
                        )[0].strip()
                        intro = "".join(
                            detail_html.xpath(
                                "//*[@id=\"textShowMore\"]/text()")).strip()
                    elif len(
                            detail_html.xpath(
                                "//div[@class=\"intro\"]/div/dl/dt")) == 3:
                        title = detail_html.xpath("//h1/a/text()")[0].strip()
                        if detail_html.xpath(
                                "//div[@class=\"company-banner-segmetation\"]/p/text()"
                        ):
                            brief_intro = detail_html.xpath(
                                "//div[@class=\"company-banner-segmetation\"]/p/text()"
                            )[0].strip()
                        else:
                            brief_intro = ""
                        xingzhi, guimo = detail_html.xpath(
                            "//div[@class=\"intro\"]/div/dl/dd[1]/text()"
                        )[0].split(" / ")
                        hangye = ";".join([
                            i.strip() for i in detail_html.xpath(
                                "//div[@class=\"intro\"]/div/dl/dd[2]/a/text()"
                            )
                        ])
                        rongzi = ""
                        quancheng = detail_html.xpath(
                            "//div[@class=\"intro\"]/div/dl/dd[@class=\"gray3\"]/text()"
                        )[0].strip()
                        intro = "".join(
                            detail_html.xpath(
                                "//*[@id=\"textShowMore\"]/text()")).strip()
                    else:
                        title = ""
                        brief_intro = ""
                        xingzhi = ""
                        guimo = ""
                        hangye = ""
                        rongzi = ""
                        quancheng = ""
                        intro = ""
                    id_code = self.u.MD5(quancheng)
                    crawl_time = self.u.get_now_time()
                    sql = "insert into tmp_jobui(id, title, brief_intro, xingzhi, guimo, hangye, rongzi, quancheng, intro, crawl_time) " \
                          "values('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" \
                          % (id_code, title, brief_intro, xingzhi,
                             guimo, hangye, rongzi, quancheng,
                             pymysql.escape_string(intro), crawl_time)
                    self.u.insert2mysql(title, sql)
                    print("-" * 100)
                    # time.sleep(3)
        else:
            print("请求失败,错误代码为:{}".format(req.status_code))

    def re_try(self):
        for rt in self.re_try_list:
            industry = re.findall(r'industry=(.*?)&', rt)[0]
            worker = re.findall(r'worker=(.*?)&', rt)[0]
            impression = re.findall(r'impression=(.*?)&', rt)[0]
            type_ = re.findall(r'type=(.*?)&', rt)[0]
            n = re.findall(r'n=(\d+)', rt)[0]

            self.base_data["industry"] = industry
            self.base_data["worker"] = worker
            self.base_data["impression"] = impression
            self.base_data["type"] = type
            self.base_data["n"] = n
            try:
                proxy_key = random.choice(list(self.proxies.keys()))
                print("<{}>".format(proxy_key))
                proxies = {proxy_key: self.proxies[proxy_key]}
                requests.packages.urllib3.disable_warnings()
                r = requests.get(url=rt,
                                 headers=self.headers,
                                 data=self.base_data,
                                 proxies=proxies)
                self.handle_data(r)
            except requests.exceptions.ConnectionError:
                self.re_try_list.append(rt)
                continue

    def main(self):
        proxy_key = random.choice(list(self.proxies.keys()))
        print("<{}>".format(proxy_key))
        proxies = {proxy_key: self.proxies[proxy_key]}
        try:
            requests.packages.urllib3.disable_warnings()
            res = requests.get(url=self.url,
                               headers=self.headers,
                               data=self.data,
                               proxies=proxies,
                               verify=False)
            print("请求状态码:" + str(res.status_code))
        except Exception as e:
            print("request has Error,Mes:" + str(e))
            time.sleep(300)
            proxy_key = random.choice(list(self.proxies.keys()))
            print("<{}>".format(proxy_key))
            proxies = {proxy_key: self.proxies[proxy_key]}
            requests.packages.urllib3.disable_warnings()
            res = requests.get(url=self.url,
                               headers=self.headers,
                               data=self.data,
                               proxies=proxies,
                               verify=False)
        if res.status_code == 200:
            html = self.u.get_xpath_obj(res.text)
            hangye = html.xpath(
                "//div[@class=\"job-select-box\"]/ul/li[1]/div/div/a/text()")
            xingzhi = html.xpath(
                "//div[@class=\"job-select-box\"]/ul/li[2]/div/div/a/text()")
            guimo = html.xpath(
                "//div[@class=\"job-select-box\"]/ul/li[3]/div/div/a/text()")
            tese = html.xpath(
                "//div[@class=\"job-select-box\"]/ul/li[4]/div/div/a/text()")
            for a in hangye[1:]:
                # time.sleep(10)
                for b in xingzhi[1:]:
                    # time.sleep(10)
                    for c in guimo[1:]:
                        # time.sleep(10)
                        for d in tese[1:]:
                            # time.sleep(5)
                            for i in range(1, 51):
                                # Build the request URL
                                print("Building request URL")
                                # time.sleep(2)
                                use_url = self.base_url.format(
                                    self.u.url_encode(a), self.u.url_encode(c),
                                    self.u.url_encode(d), self.u.url_encode(b),
                                    i)
                                # Build the request parameter payload
                                self.base_data["industry"] = a
                                self.base_data["worker"] = c
                                self.base_data["impression"] = d
                                self.base_data["type"] = b
                                try:
                                    proxy_key = random.choice(
                                        list(self.proxies.keys()))
                                    print("<{}>".format(proxy_key))
                                    proxies = {
                                        proxy_key: self.proxies[proxy_key]
                                    }
                                    requests.packages.urllib3.disable_warnings()
                                    r = requests.get(url=use_url,
                                                     headers=self.headers,
                                                     data=self.base_data,
                                                     proxies=proxies)
                                except requests.exceptions.ConnectionError:
                                    self.re_try_list.append(use_url)
                                    continue
                                self.handle_data(r)
                            # time.sleep(10)
            self.re_try()
        elif res.status_code == 403:
            print("403 Forbidden")
Example #2
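This example harvests bid-change notices from qianlima.com in two passes: collect detail URLs into MySQL, then scrape each unprocessed URL. It relies on the same project-local Util helper plus:

import re
import time

from util import Util  # placeholder path for the project-local helper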
class Qlm_zbbg:
    def __init__(self):
        self.base_url = "http://www.qianlima.com/zbbg/p{}"
        self.page = 200
        self.util = Util()
        self.headers = {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9",
            "Cache-Control":
            "max-age=0",
            "Connection":
            "keep-alive",
            "Cookie":
            "__jsluid_h=144847f002c5e67a5b7bf1888f49e19c; UM_distinctid=16c02c0e9b53d5-083f7603340745-e343166-144000-16c02c0e9b6403; gr_user_id=bfb0c075-bcf5-4e05-a943-8b3448f39a0d; Hm_lvt_0a38bdb0467f2ce847386f381ff6c0e8=1563432734; LXB_REFER=www.baidu.com; bridgeid=59454367; keywordUnit=40461; keywords=%E5%8D%83%E9%87%8C%E9%A9%AC%E6%8B%9B%E6%A0%87%E7%BD%91; CNZZDATA1277608403=172402465-1563412202-%7C1563498692; BAIDU_SSP_lcr=https://www.baidu.com/link?url=BUcmE5CDcuTFAv7tI05xeq_80sbO-X-vNsQ1yhUvF_DGdoPt-o7VQs8t7AYRpXBm&wd=&eqid=da58e9c4000e34dc000000065d312603; Hm_lvt_5dc1b78c0ab996bd6536c3a37f9ceda7=1563414294,1563432734,1563432760,1563502122; qlm_old=\"http://www.qianlima.com/zb/detail/20190719_139475196.html\"; Hm_lpvt_0a38bdb0467f2ce847386f381ff6c0e8=1563502180; qlm_username=15561585051; qlm_password=RCf8ujm8K3EfguKmBCouKpgCKK7uopgU; rem_login=1; qlmll_his=\",139475750,139491436,139497668,139475763,139475196,139264733,139264636,139269995,\"; seo_refUrl=\"http://www.directlyaccess.com\"; seo_curUrl=\"http://www.qianlima.com/common/cat.jsp\"; CNZZDATA1848524=cnzz_eid%3D430053542-1563409337-%26ntime%3D1563503598; fromWhereUrl=\"http://www.qianlima.com/zbbg/\"; seo_intime=\"2019-07-19 10:57:07\"; Hm_lpvt_5dc1b78c0ab996bd6536c3a37f9ceda7=1563506743",
            "Host":
            "www.qianlima.com",
            "Referer":
            "http://www.qianlima.com/",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        }

    def get_url_mysql(self):
        for i in range(self.page):
            url = self.base_url.format(i)
            res = self.util.get_req(url=url, headers=self.headers)

            html = self.util.get_xpath_obj(res.text)
            for dl in html.xpath("//div[@class=\"sevenday_list\"]/dl"):
                detail_url = dl.xpath("./dt/a/@href")[0].strip()
                sql = "insert into qlm_zbbg_url(url,status) values ('%s','0')" % detail_url
                self.util.insert2mysql(detail_url, sql)
        self.util.MySQL().close()

    def get_mess(self):
        conn = self.util.MySQL()
        cursor = conn.cursor()
        sql = "select url from qlm_zbbg_url where status=0;"
        cursor.execute(sql)
        for detail_url in cursor.fetchall():
            print(detail_url[0])
            detail_html = self.util.get_xpath_obj(
                self.util.get_req(url=detail_url[0],
                                  headers=self.headers).text)
            try:
                detail_title = detail_html.xpath("//h2/text()")[0]
                detail_location = "".join(
                    detail_html.xpath("//span[@class=\"site\"]/a//text()"))
                detail_status = detail_html.xpath(
                    "//span[@class=\"zhuangtai\"]//text()")[0].replace(
                        "状态:", "")
                detail_date = detail_html.xpath(
                    "//span[@class=\"d2\"]/text()")[0]
                detail_content = re.findall(
                    r'<div id="wen".*?</div>',
                    self.util.get_req(url=detail_url[0],
                                      headers=self.headers).text,
                    re.S)[0].replace("\"", "\\\"").replace("\'", "\\\'")
                record_id = self.util.MD5(detail_title + detail_location)
                crawl_time = self.util.get_now_time()
                sql = """insert into INVT_PUB_BID_MDF_INF(ID, TTL, ZON, STS, INVT_PUB_BID_CNTNT, ISU_TM, DTL_LINK, INPT_DT,)
                                                    values('%s','%s','%s','%s','%s','%s','%s','%s')""" \
                      % (record_id,
                         detail_title,
                         detail_location,
                         detail_status,
                         detail_date,
                         detail_content,
                         detail_url[0],
                         crawl_time)
                up_sql = "update qlm_zbbg_url set status = 1  where url = '{}';".format(
                    detail_url[0])
                self.util.insert2mysql(detail_title, sql, up_sql)
                conn.commit()
            except IndexError:
                print("Detail page request failed")
                time.sleep(86400)
                q = Qlm_zbbg()
                q.run()

    def run(self):
        self.get_url_mysql()
        self.get_mess()
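A minimal entry point under the same assumptions; run() first fills qlm_zbbg_url, then scrapes each unprocessed entry:

if __name__ == "__main__":
    Qlm_zbbg().run()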
Example #3
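This example pages through the MOFCOM overseas-investment registry, parsing each page with lxml. User_Agent is assumed to be a project-local list of UA strings; both local import paths are placeholders:

import http.client
import random
import time

import pymysql
import requests
from lxml import etree

from util import Util  # placeholder path for the project-local helper
from settings import User_Agent  # placeholder; a list of User-Agent strings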
class FemhzsMofcomGov:
    def __init__(self):
        self.base_url = "http://femhzs.mofcom.gov.cn/fecpmvc/pages/fem/CorpJWList_nav.pageNoLink.html?" \
                        "session=T&sp={}&sp=S+_t1.CORP_CDE%2C+_t1.id&sp=T&sp=S"
        self.headers = {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,"
            "application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9",
            "Connection":
            "keep-alive",
            "Cookie":
            "JSESSIONID=ACBDC30A40FD783627A075ADB9440B4D; insert_cookie=56224592  ",
            "Host":
            "femhzs.mofcom.gov.cn",
            "Referer":
            "http://femhzs.mofcom.gov.cn/fecpmvc/pages/fem/CorpJWList.html",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/75.0.3770.100 Safari/537.36",
        }
        self.f_headers = {
            "Host": "femhzs.mofcom.gov.cn",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Referer": "http://www.mofcom.gov.cn/publicService.shtml",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9"
        }
        self.util = Util()
        self.conn = self.util.MySQL()

        self.page = 0

    def insert2mysql(self, sql):
        try:
            self.conn.cursor().execute(sql)
            self.conn.commit()
            print("插入成功")
        except pymysql.err.IntegrityError:
            print("插入失败,数据重复")
            self.conn.rollback()
        except pymysql.err.ProgrammingError:
            print("数据异常,已回滚")
            self.conn.rollback()

    def run(self):
        first_req = requests.get(
            url="http://femhzs.mofcom.gov.cn/fecpmvc/pages/fem/CorpJWList.html",
            headers=self.f_headers)
        cookies = first_req.headers["Set-Cookie"].replace(
            " Path=/fecpmvc,", "").replace("; path=/", "")
        try:
            page = etree.HTML(first_req.text).xpath(
                "//em[@class=\"m-page-total-num\"]/text()")[0]
        except (TimeoutError, http.client.RemoteDisconnected):
            # Network errors are raised by the request, not the parse, so
            # retry the list page once with a fresh User-Agent.
            time.sleep(10)
            self.f_headers["User-Agent"] = random.choice(User_Agent)
            first_req = requests.get(
                url="http://femhzs.mofcom.gov.cn/fecpmvc/pages/fem/CorpJWList.html",
                headers=self.f_headers)
            page = etree.HTML(first_req.text).xpath(
                "//em[@class=\"m-page-total-num\"]/text()")[0]
        print("共有:{} 页".format(page))
        for i in range(1, int(page) + 1):
            print(i)
            # A dict cannot carry the repeated "sp" parameters (later keys
            # overwrite earlier ones), so pass them as a list of tuples.
            data = [
                ("session", "T"),
                ("sp", i),
                ("sp", "S _t1.CORP_CDE, _t1.id"),
                ("sp", "T"),
                ("sp", "S"),
            ]
            self.headers["Cookie"] = cookies
            url = self.base_url.format(i)
            try:
                res = requests.get(url=url,
                                   headers=self.headers,
                                   data=data,
                                   timeout=15)
            except TimeoutError:
                time.sleep(10)
                res = requests.get(url=url,
                                   headers=self.headers,
                                   data=data,
                                   timeout=15)
            time.sleep(2)
            if res.status_code == 200:
                print("请求成功,开始解析")
                html = etree.HTML(res.text)
                for tr in html.xpath("//table[@class=\"m-table\"]/tbody/tr"):
                    company_name = tr.xpath("./td[1]/text()")[0].strip()
                    investor_name = tr.xpath("./td[2]/text()")[0].strip()
                    country = tr.xpath("./td[3]/text()")[0].strip()
                    # Use the MD5 of the company name as the record id
                    md5_company = self.util.MD5(company_name)
                    # Current timestamp of the crawl
                    crawl_time = self.util.get_now_time()

                    sql = "insert into EXT_INV_ENTP_LST_INF(ID, OVS_INV_ENTP_NM, OVS_INV_NM, INV_CNR, INPT_DT) " \
                          "values('%s','%s','%s','%s','%s')" % (
                              md5_company, company_name, investor_name,
                              country, crawl_time)
                    self.insert2mysql(sql)
            else:
                print("请求失败, HTTP Code:{}".format(res.status_code))