    def logmsg(self, grade, msg, mark):
        # Drop any characters that cannot be represented in GBK so downstream
        # log collectors do not raise encoding errors.
        msg = msg.encode("gbk", "ignore").decode("gbk", "ignore")
        mark = mark.encode("gbk", "ignore").decode("gbk", "ignore")
        levels = {"info": log.info, "warning": log.warning, "error": log.error}
        if grade not in levels:
            return
        levels[grade]({
            "msg": msg,
            "mark": mark,
            "web_targ": self.web_targ,
            "tags": self.tags,
            "logname": log.name,
            "origin_host": self.origin_host,
            "level": grade.upper()
        })
    def update_proxy(self):
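        # Reuse the proxy cached in Redis while its timestamp is younger than
        # 300 seconds; otherwise fetch a fresh IP and record the new timestamp.
        # Either way, point the requests session at the chosen proxy.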
        a = time.time()
        if a - int(float(self.redisconn1.get("adv:edu"))) > 300:
            self.ip = self.get_ip()
            self.redisconn1.set("adv:edu", a)
            log.info({
                "msg": self.ip,
                "mark": "代理过期重申",
                "service": "EduSquare",
                "logname": "全国"
            })
        else:
            self.ip = self.redisconn1.get("adv:WechatSpider").split(",")[0]
            log.info({
                "msg": self.ip,
                "mark": "代理过期复用",
                "service": "EduSquare",
                "logname": "全国"
            })

        self.session.proxies = {
            "http": "http://%s" % self.ip,
            "https": "http://%s" % self.ip
        }
    def start_crawl(self):
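        # kaoshi.china.com crawler: copy the Selenium cookies into the requests
        # session, then page through every Hangzhou category listing and save
        # each training institution (plus its images) that is found.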
        counts = 0
        self.driver.implicitly_wait(60)
        self.driver.delete_all_cookies()
        self.driver.get(self.url)
        cookies = self.driver.get_cookies()
        dicts = {i["name"]: i["value"] for i in cookies}
        time.sleep(0.5)
        headers = {
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
            "Sec-Fetch-User": "******",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Sec-Fetch-Site": "same-origin",
            "Sec-Fetch-Mode": "navigate",
            "Referer": "https://kaoshi.china.com/edu/hz/",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7"
        }
        self.session.cookies = requests.utils.cookiejar_from_dict(
            dicts, cookiejar=None, overwrite=True)
        oldurls = []
        cates = [
            "peixun/tuozhan", "peixun/zxx", "peixun/fudao", "peixun/yezj",
            "peixun/yikao", "peixun/shuhua", "peixun/music", "peixun/dance",
            "peixun/qi", "peixun/qiu", "peixun/aihao", "peixun/chinese",
            "peixun/xiaoyu", "pets/peixun", "kouyu/peixun", "toefl/peixun",
            "ielts/peixun", "catti/peixun", "nce/peixun", "waixiao/peixun",
            "cet4/peixun", "jianyan/peixun", "sat/peixun", "xly/peixun",
            "dly/peixun", "zuowen/peixun", "children/peixun", "ap/peixun",
            "gmat/peixun", "igcse/peixun", "pte/peixun", "al/peixun",
            "al/peixun", "tuoye/peixun", "jianqiao/peixun", "ssat/peixun",
            "ib/peixun", "aeas/peixun", "aces/peixun", "isee/peixun",
            "qtlxks/peixun", "peixun/chuguo", "peixun/youxue", "peixun/gjxx"
        ]
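        # Page through each category until the "抱歉,没有找到相关课程"
        # (no related courses found) notice appears.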
        for k in cates:
            for j in range(100):
                try:
                    res = self.session.get(
                        'https://kaoshi.china.com/%s/hz/%d.htm' % (k, j + 2),
                        headers=headers,
                        verify=False,
                        timeout=8).text
                    if res == "":
                        self.update_proxy()
                        time.sleep(3)
                        continue
                except Exception as e:
                    self.logmsg("error",
                                msg="error" +
                                str(repr(traceback.format_exc())).replace(
                                    "\"", "").replace("\'", ""),
                                mark="")
                    self.update_proxy()
                    time.sleep(3)
                    continue
                #print("jjj", j)
                if "抱歉,没有找到相关课程" in res:
                    break
                # with open("0.txt", "w", encoding="utf-8") as f:
                #     f.write(res)

                onepage = re.findall(r'<span>机构:</span> <a href="(.*?)/">',
                                     res)
                onepage1 = list(set(onepage))
                for link in onepage1:
                    if link not in oldurls:
                        oldurls.append(link)
                        try:
                            res1 = self.session.get(
                                'https://kaoshi.china.com' + link,
                                headers=headers,
                                verify=False,
                                timeout=8).text
                            res2 = self.session.get(
                                'https://kaoshi.china.com' + link + '/introduce/',
                                headers=headers,
                                verify=False,
                                timeout=8).text
                            if res1 == "":
                                self.update_proxy()
                                time.sleep(3)
                                continue
                        except Exception as e:
                            self.logmsg(
                                "error",
                                msg="error" +
                                str(repr(traceback.format_exc())).replace(
                                    "\"", "").replace("\'", ""),
                                mark="")
                            self.update_proxy()
                            time.sleep(3)
                            continue
                        #print("iii", link)
                        res1 = etree.HTML(res1)
                        # Debug dump of the most recent introduce page
                        # (overwritten on every pass).
                        with open("zhonghua.txt", "w", encoding="utf-8") as f:
                            f.write(res2)
                        pics = re.findall(r'<figure>([\s\S]*?)</figure>', res2)
                        imgs = []
                        if len(pics):
                            for fig in pics:
                                imgs.append(
                                    re.findall(r'<img src="(.*?)">', fig)[0])
                        name = res1.xpath(
                            '/html/body/div[7]/div[1]/div[2]/p[3]/span[1]/text()'
                        )
                        if len(name):
                            for idx in range(3, 20):
                                name = res1.xpath(
                                    '/html/body/div[7]/div[1]/div[2]/p[%d]/span[1]/text()'
                                    % idx)
                                if name == []:
                                    break
                                name = name[0] if len(name) else ""
                                area = res1.xpath(
                                    '/html/body/div[7]/div[1]/div[2]/p[%d]/span[2]/text()'
                                    % idx)
                                area = area[0] if len(area) else ""
                                phone = res1.xpath(
                                    '/html/body/div[2]/div/span[1]/text()')
                                phone = phone[0] if len(phone) else ""
                                districts = [
                                    "市辖区", "上城区", "下城区", "江干区", "拱墅区", "西湖区",
                                    "滨江区", "萧山区", "余杭区", "经济技术开发区", "风景名胜区",
                                    "桐庐县", "淳安县", "大江东产业集聚区", "建德市", "富阳市",
                                    "临安市"
                                ]
                                district = ""
                                for ii in districts:
                                    if ii in area:
                                        district = ii
                                        break
                                dt = datetime.datetime.now().strftime(
                                    '%Y-%m-%d %H:%M:%S')
                                province = "浙江"
                                city = "杭州"
                                save_zhonghua(name, area, phone, dt, province,
                                              city, district)
                                if len(imgs):
                                    for img_url in imgs:
                                        save_img(name, area, img_url, dt)

                                self.logmsg("info",
                                            msg="success" + "中华" + "|" +
                                            str(idx) + name + "|" + area + "|" +
                                            phone + "|" + dt,
                                            mark="中华")
                                counts += 1
                                #print(name, area, phone)
                                time.sleep(1)
                        else:
                            for idx in range(3, 20):
                                name = res1.xpath(
                                    '/html/body/div[8]/div[1]/div[2]/p[%d]/span[1]/text()'
                                    % idx)
                                if name == []:
                                    break
                                name = name[0] if len(name) else ""
                                if name == "":
                                    continue
                                area = res1.xpath(
                                    '/html/body/div[8]/div[1]/div[2]/p[%d]/span[2]/text()'
                                    % idx)
                                area = area[0] if len(area) else ""
                                phone = res1.xpath(
                                    '/html/body/div[2]/div/span[1]/text()')
                                phone = phone[0] if len(phone) else ""

                                districts = [
                                    "市辖区", "上城区", "下城区", "江干区", "拱墅区", "西湖区",
                                    "滨江区", "萧山区", "余杭区", "经济技术开发区", "风景名胜区",
                                    "桐庐县", "淳安县", "大江东产业集聚区", "建德市", "富阳市",
                                    "临安市"
                                ]
                                district = ""
                                for ii in districts:
                                    if ii in area:
                                        district = ii
                                        break
                                province = "浙江"
                                city = "杭州"
                                dt = datetime.datetime.now().strftime(
                                    '%Y-%m-%d %H:%M:%S')
                                save_zhonghua(name, area, phone, dt, province,
                                              city, district)
                                if len(imgs):
                                    for img_url in imgs:
                                        save_img(name, area, img_url, dt)
                                log.info({
                                    "msg":
                                    "success" + "中华" + "|" + str(idx) + name +
                                    "|" + area + "|" + phone + "|" + dt,
                                    "mark":
                                    "爬取成功",
                                    "service":
                                    "EduSquare",
                                    "logname":
                                    "中华"
                                })

                                counts += 1
                                time.sleep(1)

        self.driver.quit()
        if counts < 30:
            log.info({
                "msg": "没爬够",
                "mark": "出错报警",
                "service": "EduSquare",
                "logname": "中华"
            })
    def start_crawl(self):
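        # dianping.com crawler: drive Chrome through the Hangzhou education
        # listings, rotate the proxy whenever a captcha page ("验证中心") shows
        # up, and save each shop's name, address, phone, images and video.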
        counts = 0
        self.driver.set_page_load_timeout(10)
        try:
            self.driver.get('http://www.dianping.com/hangzhou/education')
        except Exception as e:
            pass
        # time.sleep(3)
        cates = [
            "g2872", "g2873", "g2876", "g2874", "g2878", "g179", "g260",
            "g33757", "g34129", "g32722", "g34107", "g34302", "g2882"
        ]
        lists = []
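        # "lists" keeps the shop links of the previous page; an identical
        # result set means pagination has run out for this category.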
        for k in cates:
            for j in range(50):
                self.driver.set_page_load_timeout(10)
                try:
                    self.driver.get(
                        'http://www.dianping.com/hangzhou/ch75/%sp%d' %
                        (k, j + 1))
                except:
                    pass
                # self.driver.find_element_by_xpath('/html/body/div[2]/div[1]/ul/li[1]/div[1]/a[1]').click()
                # self.driver.switch_to.window(self.driver.window_handles[1])
                # time.sleep(2)
                # if len(adv):
                #     adv[0].click()
                try:
                    onepage = re.findall(
                        r'<a onclick="LXAnalytics\(\'moduleClick\', \'shoppic\'\)\" target="_blank" href="(.*?)" data-click-name="shop_img_click"',
                        self.driver.page_source)
                except:
                    continue
                #print("onepage", onepage)
                if onepage == lists:
                    break
                lists = onepage
                for link in onepage:
                    #print("dot")
                    dazhong_veri = True
                    if self.first == True:
                        self.driver.set_page_load_timeout(30)
                    else:
                        self.driver.set_page_load_timeout(4)
                    if self.test_ip(self.ip) == True:
                        #print("1111")
                        try:
                            self.driver.get(link)
                            self.first = False
                        except:
                            pass
                        try:
                            assert self.driver.page_source
                        except:
                            #print("verifalse")
                            dazhong_veri = False
                    else:
                        #print("3333")
                        self.driver.quit()
                        self.ip = self.get_ip()
                        self.options.add_argument('--proxy-server=%s' %
                                                  self.ip)
                        self.driver = webdriver.Chrome(
                            options=self.options,
                            executable_path=conf["driver"]["driver_path"])
                        continue
                    if dazhong_veri == False or "验证中心" in self.driver.page_source:
                        log.warning({
                            "msg": "",
                            "mark": "出现验证码",
                            "service": "EduSquare",
                            "logname": "大众"
                        })
                        self.driver.quit()
                        self.ip = self.get_ip()
                        # cates=self.cates[1:]
                        # #print("cates",cates)
                        self.options.add_argument('--proxy-server=%s' % self.ip)
                        self.options.add_experimental_option(
                            "excludeSwitches", ['enable-automation'])
                        self.driver = webdriver.Chrome(
                            options=self.options,
                            executable_path=conf["driver"]["driver_path"])
                        self.first = True
                        self.driver.get('http://www.dianping.com/')
                        continue
                    phone = re.findall(
                        r'<span class="item J-phone-hide" data-phone="(.*?)">',
                        self.driver.page_source)
                    phone = phone[0] if len(phone) else ""
                    area = re.findall(
                        r' <span class="item">地址:</span>([\s\S]*?)</div>',
                        self.driver.page_source)
                    area = area[0].strip() if len(area) else ""
                    name = re.findall(r'<h1>(.*?)</h1>',
                                      self.driver.page_source)
                    name = name[0].strip() if len(name) else ""
                    if name == "":
                        #print("2222")
                        # time.sleep(random.choice([3.1,2.3,2.8]))
                        continue
                    districts = [
                        "市辖区", "上城区", "下城区", "江干区", "拱墅区", "西湖区", "滨江区", "萧山区",
                        "余杭区", "经济技术开发区", "风景名胜区", "桐庐县", "淳安县", "大江东产业集聚区",
                        "建德市", "富阳市", "临安市"
                    ]
                    dis = re.findall(
                        r'<div class="breadcrumb">([\s\S]*?)</div>',
                        self.driver.page_source)
                    dis = dis[0] if len(dis) else ""
                    district = ""
                    for ii in districts:
                        if ii in dis:
                            district = ii
                            break
                    province = "浙江"
                    city = "杭州"
                    dt = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    save_meituan_xuexipeixun(name, area, phone, dt, province,
                                             city, district, None, None)
                    files = re.findall(r'<div class="thumb">([\s\S]*?)</div>',
                                       self.driver.page_source)
                    # #print(files)
                    if len(files):
                        files = files[0]
                        video = re.findall(r'data-video="([\s\S]*?)">', files)
                        if len(video):
                            print("video", video)
                            save_video(name, area, video, dt)
                        imgs = re.findall(r'<img src="(.*?)" alt="', files)
                        if len(imgs):
                            for i in imgs:
                                print("imgs", imgs)
                                save_img(name, area, i, dt)
                    log.info({
                        "msg":
                        "success" + "大众|" + name + "|" + area + "|" + phone +
                        "|" + dt,
                        "mark":
                        "爬取成功一篇",
                        "service":
                        "EduSquare",
                        "logname":
                        "大众"
                    })
                    counts += 1
                    # time.sleep(random.choice([3.1,2.3,2.8]))

        self.driver.quit()
        if counts < 30:
            log.error({
                "msg": "fail",
                "mark": "爬取不够",
                "service": "EduSquare",
                "logname": "大众"
            })
    def start_crawl(self):
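        # jiaoyubao.cn crawler: log in through Selenium, reuse its cookies in
        # the requests session, then walk every category page and store each
        # institution together with its coordinates and images.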
        counts = 0
        self.driver.implicitly_wait(60)
        self.driver.delete_all_cookies()
        self.driver.get(self.url)
        WebDriverWait(self.driver, 30).until(
            EC.presence_of_element_located(
                (By.XPATH,
                 '/html/body/div[3]/div/div[2]/div/div/div[1]/div[5]/a'
                 ))).click()
        time.sleep(0.5)
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#txtUserName'))).send_keys("13282027081")
        time.sleep(0.5)
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#txtPwd'))).send_keys("jygc2020")
        time.sleep(0.5)
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((
                By.CSS_SELECTOR,
                "body > div.newlogin-middle > div > div.newlogin-right.b-radius > div > div > a"
            ))).click()
        headers = {
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
            "Sec-Fetch-User": "******",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Sec-Fetch-Site": "same-origin",
            "Sec-Fetch-Mode": "navigate",
            "Referer": "https://hz.jiaoyubao.cn/edu/",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7"
        }
        url1 = 'https://hz.jiaoyubao.cn/wudaoxingti/'
        self.driver.get(url1)
        cookies = self.driver.get_cookies()
        dicts = {i["name"]: i["value"] for i in cookies}
        time.sleep(0.5)
        self.session.cookies = requests.utils.cookiejar_from_dict(
            dicts, cookiejar=None, overwrite=True)
        cates = [
            "jueshiwu", "dslingwu", "jiewu", "baleiwu", "dupiwu", "ladingwu",
            "minzuwu", "jianmeicao", "xiandaiwu", "gudianwu", "yueqi",
            "semspx", "qsnmspx", "shufameishu", "caiyi", "weiqi", "xiangqi",
            "guojixiangqi", "guojitiaoqi", "motepeixun", "liyyipeixun",
            "qiannengkaifa", "shougong", "xingqu", "koucai", "guoxue",
            "shengyue", "03sui", "qinzileyuan", "zaojiaotese", "zhilikaifa",
            "gantong", "bantuoban", "teshuzaojiao", "mengshijiao", "xiaoxue",
            "shaoeryingyu", "xialing", "youxiaoxianjie", "chuzhong",
            "gaozhong", "cjgk", "ykpx", "zizhuzhao", "hanjiafudao", "yasi",
            "tuofu", "shaoeryingyu", "qingshao", "apkao", "kouyutingli", "vip",
            "xingainian", "act", "gre", "sat", "jianqiaoyingyu", "xiaoyuzhong",
            "liuxue", "guojijiaoyu", "yishuzuopin"
        ]
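        # Page through each category; a "没有找到" (nothing found) page ends it.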
        for k in cates:
            for j in range(100):
                #print("j",j)
                try:
                    res = self.session.get(
                        'https://hz.jiaoyubao.cn/%s/p%d.html' % (k, j + 1),
                        headers=headers,
                        verify=False,
                        timeout=8)
                    if res.text == "" or "System error" in res.text or "系统出错" in res.text:
                        self.update_proxy()
                        continue
                except Exception as e:
                    self.update_proxy()
                    continue
                time.sleep(0.5)
                onepage = re.findall(
                    r'<a href="(.*?)" target="_blank" class="office-rlist-name" title="',
                    res.text)
                if '没有找到' in res.text:
                    break
                for href in onepage:
                    url = 'https:' + href if "//" in href else 'https://hz.jiaoyubao.cn' + href
                    try:
                        res1 = self.session.get(url,
                                                headers=headers,
                                                verify=False,
                                                timeout=8)
                        if res1.text == "" or "System error" in res1.text or "系统出错" in res1.text:
                            self.update_proxy()
                            continue
                    except Exception as e:
                        self.update_proxy()
                        continue
                    name = re.findall(r'【(.+?)】', res1.text)
                    name = name[0] if len(name) else ""
                    if name == "":
                        continue
                    area = re.findall(
                        r'<p class="ellipsis-1 fl">([\s\S]+?)</p>', res1.text)
                    area = area[0].replace(' ', '').replace('\n', '').replace(
                        '\t', '').replace('\r', '') if len(area) else ""
                    phone = re.findall(r'<span name="span_tel_400">(.+?)\n',
                                       res1.text)
                    phone = phone[0].replace('</span>', '').replace(
                        ' ', '').replace('\n', '').replace('\t', '').replace(
                            '\r', '') if len(phone) else ""
                    #print(name,area,phone)
                    lng = re.findall(r'var lng = "(.+?)"', res1.text)
                    lng = lng[0] if len(lng) else None
                    # print("lng",lng)
                    lat = re.findall(r'var lat = "(.+?)"', res1.text)
                    lat = lat[0] if len(lat) else None
                    # print("lat",lat)

                    img = re.findall(r'"images": \["(.*?)"],', res1.text)
                    dt = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    if len(img):
                        for part in img[0].split(","):
                            img_url = part.replace('"', '')
                            save_img(name, area, img_url, dt)
                    districts = [
                        "市辖区", "上城区", "下城区", "江干区", "拱墅区", "西湖区", "滨江区", "萧山区",
                        "余杭区", "经济技术开发区", "风景名胜区", "桐庐县", "淳安县", "大江东产业集聚区",
                        "建德市", "富阳市", "临安市"
                    ]
                    dis = re.findall(
                        r'<p class="ellipsis-1 fl">([\s\S]*?)</p>', res1.text)

                    dis = dis[0] if len(dis) else ""
                    district = ""
                    for ii in districts:
                        if ii in dis:
                            district = ii
                            break
                    province = "浙江"
                    city = "杭州"
                    #name,area,phone,addtime,province,city,district,lng,lat)
                    save_jioyubao(name, area, phone, dt, province, city,
                                  district, lng, lat)
                    log.info({
                        "msg":
                        "success" + "教育宝" + "|" + name + "|" + area + "|" +
                        phone,
                        "mark":
                        "爬取成功",
                        "service":
                        "EduSquare",
                        "logname":
                        "教育宝"
                    })
                    counts += 1
        self.driver.quit()
        if counts < 20:
            log.error({
                "msg": "没爬够",
                "mark": "出错告警",
                "service": "EduSquare",
                "logname": "教育宝"
            })
    def onepage2(self):
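        # Meituan detail scraper: open each result in a new tab, read
        # name/phone/address via XPath, and hand the current URL to a fresh
        # Wxgzh_MeiTuan instance when the proxy stops responding.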
        counts = 0
        for i in range(32):
            try:
                WebDriverWait(self.driver, 5).until(
                    EC.presence_of_element_located((
                        By.XPATH,
                        '//*[@id="react"]/div/div/div[2]/div[1]/div[2]/div[2]/div[%d]/div/div/a'
                        % (i + 1)))).click()
            except:
                break

            self.driver.set_page_load_timeout(30)
            try:
                self.driver.switch_to.window(self.driver.window_handles[1])
            except:
                self.driver.execute_script('window.stop()')
            proxy_valid = self.test_ip(self.ip)
            page_valid = True
            try:
                assert self.driver.page_source
            except:
                page_valid = False
            if not proxy_valid or not page_valid:
                self.driver.switch_to.window(self.driver.window_handles[0])
                self.ip = self.get_ip()
                self.logmsg("info", self.ip, "代理过期重申")

                url_zhong = self.driver.current_url
                self.driver.quit()
                return Wxgzh_MeiTuan(self.ip, self.cates,
                                     "").continue_crawl2(url_zhong)
            page = etree.HTML(self.driver.page_source)
            name = page.xpath(
                '//*[@id="react"]/div/div/div[2]/div[1]/h1/text()')
            name = name[0] if len(name) else ""
            if name == "":
                self.driver.close()
                self.driver.switch_to.window(self.driver.window_handles[0])
                continue
            phone = page.xpath(
                '//*[@id="react"]/div/div/div[2]/div[1]/div[2]/div[2]/span[2]/text()'
            )
            phone = phone[0] if len(phone) else ""
            address = page.xpath(
                '//*[@id="react"]/div/div/div[2]/div[1]/div[2]/div[1]/a/span/text()'
            )
            address = address[0] if len(address) else ""
            dt = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            lng = re.findall(r'var lng = "(.+?)"', self.driver.page_source)
            lat = re.findall(r'var lat = "(.+?)"', self.driver.page_source)
            lng = lng[1] if len(lng) > 1 and '}' not in lng[1] else None
            lat = lat[1] if len(lat) > 1 and '}' not in lat[1] else None
            imgs = re.findall(r'<div class="img-item"(.*?)</div>',
                              self.driver.page_source)
            imgs = list(set(imgs))
            if len(imgs):
                for img_item in imgs:
                    img = re.findall(r'\((.*?)\)', img_item)
                    if len(img):
                        save_img(name, address, img[0], dt)
            districts = [
                "市辖区", "上城区", "下城区", "江干区", "拱墅区", "西湖区", "滨江区", "萧山区", "余杭区",
                "经济技术开发区", "风景名胜区", "桐庐县", "淳安县", "大江东产业集聚区", "建德市", "富阳市",
                "临安市"
            ]
            district = ""
            for ii in districts:
                if ii in address:
                    district = ii
                    break
            province = "浙江"
            city = "杭州"
            log.info({
                "msg":
                "success" + "美团|" + name + "|" + address + "|" + phone + "|" +
                dt,
                "mark":
                "爬取成功",
                "service":
                "EduSquare",
                "logname":
                "美团"
            })

            counts += 1
            self.driver.close()
            self.driver.switch_to.window(self.driver.window_handles[0])
            # time.sleep(random.choice([3.1, 2.3, 2.8]))
        if counts < 1:
            log.error({
                "msg": "没爬够",
                "mark": "出错报警",
                "service": "EduSquare",
                "logname": "美团"
            })
    def onepage(self):
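        # Meituan detail scraper: open each result in a new tab and read the
        # shop info from the embedded mapInfo JSON; if the proxy dies, rebuild
        # the driver, page forward to the saved page number and retry.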
        counts = 0
        time.sleep(2)
        for i in range(32):
            try:
                WebDriverWait(self.driver, 5).until(
                    EC.presence_of_element_located((
                        By.XPATH,
                        '//*[@id="react"]/div/div/div[2]/div[1]/div[2]/div[2]/div[%d]/div/div/a'
                        % (i + 1)))).click()
            except:
                break

            self.driver.set_page_load_timeout(30)
            try:
                self.driver.switch_to.window(self.driver.window_handles[1])
            except:
                self.driver.execute_script('window.stop()')
            proxy_valid = self.test_ip(self.ip)
            page_valid = True
            try:
                assert self.driver.page_source
            except:
                page_valid = False
            if not proxy_valid or not page_valid:
                print("proxy_valid", proxy_valid)
                print("page_valid", page_valid)
                self.driver.switch_to.window(self.driver.window_handles[0])
                self.ip = self.get_ip()
                self.logmsg("info", self.ip, "代理过期重申")

                page_no = WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((
                        By.CSS_SELECTOR,
                        '#react > div > div > div.center-content.clearfix > div.left-content > nav > ul > li.pagination-item.select.num-item'
                    ))).text
                print("url_zhong", page_no)
                self.driver.quit()

                self.options.add_argument("--proxy-server=http://%s" % self.ip)
                self.driver = webdriver.Chrome(
                    options=self.options,
                    executable_path=conf["driver"]["driver_path"])
                self.driver.get(self.url)
                self.driver.refresh()
                url1 = 'https://hz.meituan.com/s/%E5%AD%A6%E4%B9%A0%E5%9F%B9%E8%AE%AD/'
                self.driver.get(url1)
                print("getnewurl")

                while True:
                    mouse = WebDriverWait(self.driver, 10).until(
                        EC.presence_of_element_located((
                            By.CSS_SELECTOR,
                            "#react > div > div > div.center-content.clearfix > div.left-content > div.filter-box > div.filter-section-wrapper > div:nth-child(1) > div.tags > div > div:nth-child(16) > a > span"
                        )))
                    if mouse:
                        break
                time.sleep(3)
                ActionChains(self.driver).move_to_element(mouse).perform()
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((
                        By.XPATH,
                        '//*[@id="react"]/div/div/div[2]/div[1]/div[1]/div[1]/div[2]/div/div/div[%s]/a/span'
                        % list(self.cates.keys())[0]))).click()
                print("self.cates", self.cates)
                while int(
                        WebDriverWait(self.driver, 10).until(
                            EC.presence_of_element_located((
                                By.CSS_SELECTOR,
                                '#react > div > div > div.center-content.clearfix > div.left-content > nav > ul > li.pagination-item.select.num-item'
                            ))).text) < int(page_no):
                    print(
                        WebDriverWait(self.driver, 10).until(
                            EC.presence_of_element_located((
                                By.CSS_SELECTOR,
                                '#react > div > div > div.center-content.clearfix > div.left-content > nav > ul > li.pagination-item.select.num-item'
                            ))).text)
                    time.sleep(1)
                    WebDriverWait(self.driver, 10).until(
                        EC.presence_of_element_located((
                            By.CSS_SELECTOR,
                            '#react > div > div > div.center-content.clearfix > div.left-content > nav > ul > li.pagination-item.next-btn.active > a'
                        ))).click()
                # return Wxgzh_MeiTuan(self.ip, self.cates, "").start_crawl(page_no)
                return self.onepage()

            str1 = re.findall(r".push\((.*?)\);", self.driver.page_source)
            res0 = [i for i in str1 if "mapInfo" in i]
            if len(res0):
                res0 = res0[0]
            else:
                self.driver.close()
                self.driver.switch_to.window(self.driver.window_handles[0])
                # time.sleep(random.choice([3.1, 2.3, 2.8]))
                continue
            shop_info = json.loads(res0)['params']['shopInfo']
            name = shop_info['shopName']
            if name == "":
                self.driver.close()
                self.driver.switch_to.window(self.driver.window_handles[0])
                # time.sleep(random.choice([3.1, 2.3, 2.8]))
                continue
            phone = shop_info['phoneNo']
            address = shop_info['address']
            dt = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            lng = re.findall(r'"glng":(.+?),', self.driver.page_source)
            lat = re.findall(r'"glat":(.+?),', self.driver.page_source)
            lng = lng[0] if len(lng) else None
            lat = lat[0] if len(lat) else None
            #print("lng,lat", lng, lat)
            imgs = re.findall(r'<div class="img-item"(.*?)</div>',
                              self.driver.page_source)
            imgs = list(set(imgs))
            if len(imgs):
                for img_item in imgs:
                    img = re.findall(r'\((.*?)\)', img_item)
                    if len(img):
                        save_img(name, address, img[0], dt)
            districts = [
                "市辖区", "上城区", "下城区", "江干区", "拱墅区", "西湖区", "滨江区", "萧山区", "余杭区",
                "经济技术开发区", "风景名胜区", "桐庐县", "淳安县", "大江东产业集聚区", "建德市", "富阳市",
                "临安市"
            ]
            district = ""
            for ii in districts:
                if ii in address:
                    district = ii
                    break
            province = "浙江"
            city = "杭州"
            log.info({
                "msg":
                "success" + "美团|" + name + "|" + address + "|" + phone + "|" +
                dt,
                "mark":
                "爬取成功",
                "service":
                "EduSquare",
                "logname":
                "美团"
            })

            counts += 1
            self.driver.close()
            self.driver.switch_to.window(self.driver.window_handles[0])
        if counts < 1:
            log.error({
                "msg": "没爬够",
                "mark": "出错报警",
                "service": "EduSquare",
                "logname": "美团"
            })
    def start_crawl(self):
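        # xwpx.emis.edu.cn crawler: for every Hangzhou district code, submit
        # the query captcha, page through the licensed-institution results and
        # save the detail fields of each record.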
        counts = 0
        self.driver.implicitly_wait(30)
        self.driver.delete_all_cookies()
        self.driver.get(self.url)
        self.driver.get(self.url)
        cookies = self.driver.get_cookies()
        dicts = {i["name"]: i["value"] for i in cookies}
        self.session.cookies = requests.utils.cookiejar_from_dict(
            dicts, cookiejar=None,
            overwrite=True)  #"X-OverrideGateway":self.ip,

        dup = []
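        # dup holds the orgIds returned by the previous results page; when a
        # page repeats them exactly, the current district has no more pages.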
        districts = {
            "市辖区": 330101,
            "上城区": 330102,
            "下城区": 330103,
            "江干区": 330104,
            "拱墅区": 330105,
            "西湖区": 330106,
            "滨江区": 330108,
            "萧山区": 330109,
            "余杭区": 330110,
            "经济技术开发区": 330118,
            "风景名胜区": 330119,
            "桐庐县": 330122,
            "淳安县": 330127,
            "大江东产业集聚区": 330128,
            "建德市": 330182,
            "富阳市": 330183,
            "临安市": 330185
        }

        for key, value in districts.items():
            for page in range(50):
                headers = {
                    "Content-Type": "application/x-www-form-urlencoded",
                    "User-Agent":
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
                    "Accept": "*/*",
                    "Sec-Fetch-Site": "same-site",
                    "Sec-Fetch-Mode": "cors",
                    "Referer": "http://xwpx.emis.edu.cn/omsweb/org/query/page",
                    "Origin": "http://xwpx.emis.edu.cn",
                    "Accept-Encoding": "gzip, deflate, br",
                    "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7"
                }

                # self.session.keep_alive = False
                # from requests.adapters import HTTPAdapter
                # from urllib3.util import Retry
                # retry = Retry(connect=1, backoff_factor=5)
                # adapter = HTTPAdapter(max_retries=retry)
                # self.session.mount('http://', adapter)
                # self.session.mount('https://', adapter)
                try:
                    content = self.session.get(
                        "http://xwpx.emis.edu.cn/omsweb/captcha.jpg",
                        headers=headers,
                        verify=False,
                        timeout=30).content

                except Exception as e:
                    self.update_proxy()
                    time.sleep(3)
                    continue
                with open("code.jpg", "wb") as f:
                    f.write(content)
                # urllib.request.urlretrieve("http://xwpx.emis.edu.cn/omsweb/captcha.jpg", "local-filename.jpg")
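                # getcode() is expected to turn the downloaded captcha image
                # into text; the result is posted back with the query form.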
                code = getcode("code.jpg")
                data = {
                    "province": "330000",
                    "city": "330100",
                    "district": str(value),
                    "orgName": "",
                    "legalCode": "1008001",
                    "pageNo": page,
                    "pageSize": "",
                    "code": code
                }
                try:
                    res1 = self.session.post(
                        "http://xwpx.emis.edu.cn/omsweb/org/query/page",
                        headers=headers,
                        data=data,
                        verify=False,
                        timeout=60).text
                    if res1 == "" or "System error" in res1 or "系统出错" in res1:
                        self.update_proxy()
                        continue
                except Exception as e:
                    self.update_proxy()
                    continue

                res = re.findall(r'<a href="#" onclick="viewDetail\((\d+)\)',
                                 res1)
                if res == dup:
                    break
                dup = res
                #print("res",res)
                if len(res):
                    for i in res:
                        headers1 = {
                            "Connection":
                            "keep-alive",
                            "Content-Length":
                            "11",
                            "Cache-Control":
                            "max-age=0",
                            "Origin":
                            "http://xwpx.emis.edu.cn",
                            "Upgrade-Insecure-Requests":
                            "1",
                            "Content-Type":
                            "application/x-www-form-urlencoded",
                            "User-Agent":
                            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
                            "Accept":
                            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
                            "Referer":
                            "http://xwpx.emis.edu.cn/omsweb/org/query/page",
                            "Accept-Encoding":
                            "gzip, deflate",
                            "Accept-Language":
                            "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7"
                        }
                        data1 = {"orgId": int(i)}
                        try:
                            res2 = self.session.post(
                                "http://xwpx.emis.edu.cn/omsweb/org/query/info",
                                headers=headers1,
                                data=data1,
                                verify=False,
                                timeout=30).text
                            if res2 == "" or "System error" in res2 or "系统出错" in res2:
                                log.warning({
                                    "msg": "",
                                    "mark": "内容出错" + res2,
                                    "service": "EduSquare",
                                    "logname": "全国"
                                })

                                self.update_proxy()
                                continue
                        except Exception as e:
                            self.update_proxy()
                            continue

                        name = re.findall(
                            r'<p class="panelbody-p fontsize18">([\s\S]+?)</p>',
                            res2)
                        if len(name):
                            name = name[0].replace('\n', '').replace(
                                '\t', '').replace('\r', '')
                        else:
                            continue
                        shelishijian = re.findall(
                            r'设立时间:([\s\S]+?)<',
                            res2)[0].replace('\n',
                                             '').replace('\t',
                                                         '').replace('\r', '')
                        #print("shelishijian",shelishijian)
                        tongyidaima = re.findall(
                            r'统一社会信用代码:([\s\S]+?)<',
                            res2)[0].replace('\n',
                                             '').replace('\t',
                                                         '').replace('\r', '')
                        if tongyidaima == "是":
                            tongyidaima = "办理中"
                        zhucedizhi = re.findall(
                            r'注册地址:([\s\S]+?)<',
                            res2)[0].replace('\n',
                                             '').replace('\t',
                                                         '').replace('\r', '')
                        area = re.findall(r'实际经营地址:([\s\S]+?)<',
                                          res2)[0].replace('\n', '').replace(
                                              '\t', '').replace('\r', '')
                        farendaibiaoxingming = re.findall(
                            r'法定代表人姓名:([\s\S]+?)<',
                            res2)[0].replace('\n',
                                             '').replace('\t',
                                                         '').replace('\r', '')
                        xiaozhangfuzeren = re.findall(
                            r'校长\(负责人\)姓名:([\s\S]+?)<',
                            res2)[0].replace('\n',
                                             '').replace('\t',
                                                         '').replace('\r', '')
                        jubanzhemingcheng = re.findall(
                            r'举办者名称\(姓名\):([\s\S]+?)<',
                            res2)[0].replace('\n',
                                             '').replace('\t',
                                                         '').replace('\r', '')
                        jubanzheshuxing = re.findall(
                            r'举办者属性:([\s\S]+?)<',
                            res2)[0].replace('\n',
                                             '').replace('\t',
                                                         '').replace('\r', '')

                        banxuezizhi = re.findall(
                            r'办学资质说明:([\s\S]+?)<',
                            res2)[0].replace('\n',
                                             '').replace('\t',
                                                         '').replace('\r', '')
                        banxuexukezhenghao = re.findall(
                            r'办学许可证号:([\s\S]+?)<',
                            res2)[0].replace('\n',
                                             '').replace('\t',
                                                         '').replace('\r', '')
                        fazhengjiguan = re.findall(
                            r'发证机关:([\s\S]+?)<',
                            res2)[0].replace('\n',
                                             '').replace('\t',
                                                         '').replace('\r', '')
                        farendengjibumen = re.findall(
                            r'法人登记部门:([\s\S]+?)<',
                            res2)[0].replace('\n',
                                             '').replace('\t',
                                                         '').replace('\r', '')

                        peixunleibie = re.findall(
                            r'培训类别:([\s\S]+?)<',
                            res2)[0].replace('\n',
                                             '').replace('\t',
                                                         '').replace('\r', '')
                        peixunneirong = re.findall(
                            r'培训内容:([\s\S]+?)<',
                            res2)[0].replace('\n',
                                             '').replace('\t',
                                                         '').replace('\r', '')
                        jianzhumianji = re.findall(
                            r'建筑面积\(平方米\):([\s\S]+?)<',
                            res2)[0].replace('\n',
                                             '').replace('\t',
                                                         '').replace('\r', '')
                        province = "浙江"
                        city = "杭州"
                        district = key
                        phone = ""
                        #print("------------------------------------")
                        dt = datetime.datetime.now().strftime(
                            '%Y-%m-%d %H:%M:%S')
                        save_quanguoxiaowai(
                            name=name,
                            shelishijian=shelishijian,
                            tongyidaima=tongyidaima,
                            zhucedizhi=zhucedizhi,
                            peixunneirong=peixunneirong,
                            area=area,
                            farendaibiaoxingming=farendaibiaoxingming,
                            xiaozhangfuzeren=xiaozhangfuzeren,
                            jubanzhemingcheng=jubanzhemingcheng,
                            jubanzheshuxing=jubanzheshuxing,
                            banxuezizhi=banxuezizhi,
                            banxuexukezhenghao=banxuexukezhenghao,
                            fazhengjiguan=fazhengjiguan,
                            farendengjibumen=farendengjibumen,
                            peixunleibie=peixunleibie,
                            jianzhumianji=jianzhumianji,
                            addtime=dt,
                            province=province,
                            city=city,
                            district=district,
                            phone=phone)
                        log.info({
                            "msg":
                            "success" + "全国" + "|" + i + shelishijian + "|" +
                            tongyidaima + "|" + zhucedizhi + "|" + dt,
                            "mark":
                            "爬取成功",
                            "service":
                            "EduSquare",
                            "logname":
                            "全国"
                        })

                        counts += 1
                        # time.sleep(1)
        self.driver.quit()
        if counts < 1:
            log.info({
                "msg": "没爬够",
                "mark": "出错报警",
                "service": "EduSquare",
                "logname": "全国"
            })