Exemple #1
0
 def fetch_xici(self, num):
     """Scrape http://www.xicidaili.com/ for proxies (the source rates its quality at ~10%).

     Walks at most two listing pages. For every table row whose speed is
     below 3 and whose latency is below 1 (values read from the ``title``
     attributes of columns 7 and 8, with the trailing unit character
     stripped), the ``ip:port`` pair is checked with ``self.proxy_vaild``.

     Args:
         num: number of validated proxies wanted.

     Returns:
         list: the proxy dicts accepted by ``self.proxy_vaild``; at most
         ``num`` entries (empty when ``num`` is 0 or negative).
     """
     max_pages = 2
     page = 1
     proxyes = []
     # Strict '<': stop as soon as `num` proxies have been collected.
     while len(proxyes) < num and page <= max_pages:
         url = "http://www.xicidaili.com/nn/%s" % page
         # A timeout keeps a stalled proxy-list site from blocking the crawl forever.
         req = requests.get(url, headers=self.headers, timeout=10)
         selector = etree.HTML(req.text)
         # NOTE(review): guards against the parser yielding no document - confirm for the lxml version in use.
         rows = selector.xpath('//tr[@class]') if selector is not None else []
         for line in rows:
             tds = line.xpath('td/text()')
             speed_attr = line.xpath('td[7]/div/@title')
             latency_attr = line.xpath('td[8]/div/@title')
             # Skip rows that do not have the expected cells instead of raising IndexError.
             if len(tds) < 2 or not speed_attr or not latency_attr:
                 continue
             try:
                 speed = float(speed_attr[0][:-1])
                 latency = float(latency_attr[0][:-1])
             except ValueError:
                 # Non-numeric speed/latency text: the row cannot be rated, skip it.
                 continue
             if speed < 3 and latency < 1:
                 proxy = "%s:%s" % (tds[0], tds[1])
                 proxy_dict = {'http': proxy, 'https': proxy}
                 valid_res = self.proxy_vaild(proxy_dict)
                 if valid_res[0]:
                     proxyes.append(valid_res[1])
                     if len(proxyes) >= num:
                         # Enough proxies: do not validate the rest of the page.
                         break
         logger.info('抓取 xicidaili 第 %d 页,有效代理 %d 个'%(page, len(proxyes)))
         page += 1
     return proxyes
Exemple #2
0
 def scheduler(self):
     """Dispatch on the ``infocode`` field of ``self.respond``.

     ``self.status_dict`` maps each info code that has a dedicated handler
     to that handler. Codes listed in ``pass_code`` are logged and skipped
     via ``self.status_pass``; any other code is printed, logged and routed
     to ``self.status_invalid_request``.

     Dispatch is driven by ``self.status_dict`` itself, so every mapped code
     (including "10010", which the previous hand-maintained list omitted and
     therefore never reached its handler) is handled.

     Returns:
         Whatever the selected handler returns for mapped codes; ``None``
         for skipped or invalid codes.

     Raises:
         KeyError: if ``self.respond`` has no ``'infocode'`` entry.
     """
     pass_code = ['20800', '20801', '20802', '20803', '20003']
     self.status_dict = {
         "10000": self.status_ok,
         "10001": self.status_change_key,
         "10003": self.status_change_key,
         "10004": self.status_change_user_agent,
         "10010": self.status_change_proxy,
         "10016": self.status_change_user_agent,
         "10020": self.status_change_key,
         "10021": self.status_change_proxy,
         "10022": self.status_change_proxy,
         "10023": self.status_change_key
     }
     infocode = self.respond['infocode']
     handler = self.status_dict.get(infocode)
     if handler is not None:
         return handler()
     if infocode in pass_code:
         logger.info('出现 %s 跳过的网址 %s' % (infocode, self.req_url))
         self.status_pass()
     else:
         print(infocode)
         logger.info(infocode)
         self.status_invalid_request()
Exemple #3
0
 def get_count(self):
     """Run the request and return the response's ``count`` field as an int.

     Calls ``self.requestor()`` first, then reads ``"count"`` from
     ``self.respond``. When the field is missing or falsy, the request URL
     is logged and 0 is returned.
     """
     self.requestor()
     raw_total = self.respond.get("count")
     if not raw_total:
         logger.info('没有count字段的网址是 %s' % self.req_url)
         return 0
     return int(raw_total)
Exemple #4
0
 def status_sleep_try(self):
     """Pause for five seconds and re-run ``self.process()``, with a retry cap.

     While ``self.repeat_times`` is at most 10 the method sleeps, prints a
     banner, increments the counter and returns the result of
     ``self.process()``. Past that limit it logs the URL and calls
     ``self.status_pass()`` instead, returning ``None``.
     """
     retry_allowed = self.repeat_times <= 10
     if not retry_allowed:
         logger.info("重试超过 10 次, 跳过 %s" % self.req.url)
         self.status_pass()
         return None
     time.sleep(5)
     print('=====================休息5秒======================')
     self.repeat_times += 1
     return self.process()
Exemple #5
0
 def status_ok(self):
     """Parse every entry of the response's ``results`` list.

     Returns:
         list: ``self.parser(entry)`` for each entry, after printing a
         success line for ``self.params['address']``. When ``results`` is
         missing or empty the URL is logged and ``None`` is returned.
     """
     results = self.respond.get('results')
     if not results:
         logger.info('结果为空 %s' % self.req.url)
         return None
     res_list = [self.parser(entry) for entry in results]
     print('%s 地址获取成功' % self.params['address'])
     return res_list
Exemple #6
0
 def scheduler(self):
     """Route the current response to the next processing step.

     * ``self.respond`` is not a dict -> return ``self.requestor()``.
     * dict without a truthy ``"Error"`` entry -> return ``self.parser()``.
     * dict with an ``"Error"`` entry -> record its ``"Message"`` through
       ``self.req_stat`` and log it with the date and city pair; returns
       ``None``.
     """
     if not isinstance(self.respond, dict):
         return self.requestor()
     if not self.respond.get("Error"):
         return self.parser()
     error_info = self.respond["Error"]["Message"]
     self.req_stat(error_info)
     route = (self.DDate, self.Dcity, self.Acity, error_info)
     logger.info('%s %s-%s %s' % route)
 def parser(self):
     """Turn ``self.respond`` into a DataFrame tagged with city and date.

     Returns:
         pandas.DataFrame built from ``self.respond`` with a ``city`` column
         set to ``self.params['city_name']`` and a ``date`` column set to
         ``self.params['date_end']``. When ``self.respond`` is falsy the
         request URL is printed, a "no data" line is logged and ``None`` is
         returned.
     """
     if not self.respond:
         print(self.req_url)
         logger.info('%s 没有数据' % self.params['city_name'])
         return None
     frame = pd.DataFrame(self.respond)
     for column, source_key in (('city', 'city_name'), ('date', 'date_end')):
         frame[column] = self.params[source_key]
     return frame
Exemple #8
0
 def status_ok(self):
     """Parse every non-empty POI of the response.

     Returns:
         list: ``self.parser(poi)`` for each truthy entry of
         ``self.respond['pois']``; falsy entries are logged and skipped.
         When ``pois`` itself is missing or empty the URL is logged and
         ``None`` is returned.
     """
     pois = self.respond.get('pois')
     if not pois:
         logger.info('没有值的连接是 %s' % self.req_url)
         return None
     all_res = []
     for poi in pois:
         if not poi:
             logger.info('没有值的连接是 %s' % self.req_url)
             continue
         all_res.append(self.parser(poi))
     return all_res
Exemple #9
0
 def proxy_vaild(self, proxy_dict):
     """Check whether a proxy works by fetching an IP-echo page through it.

     Args:
         proxy_dict: mapping passed to ``requests.get`` as ``proxies``.

     Returns:
         tuple: ``(True, proxy_dict)`` when the probe answers with status
         200 and a body different from ``Fetch_proxy.local_ip``; otherwise
         ``(False,)`` (also on any of the caught timeout / proxy /
         connection / chunked-encoding errors).
     """
     # URL used to test whether the proxy IP is usable.
     probe_url = "http://ip.chinaz.com/getip.aspx"
     try:
         resp = requests.get(
             probe_url,
             proxies=proxy_dict,
             headers=self.headers,
             timeout=3,
             allow_redirects=False,
         )
         usable = resp.status_code == 200 and resp.text != Fetch_proxy.local_ip
         if not usable:
             logger.info('_______%s 无效代理________'%resp.status_code)
             return (False, )
         print(resp.text)
         return (True, proxy_dict)
     except (req_e.ReadTimeout, req_e.ConnectTimeout, req_e.ProxyError,
             req_e.ConnectionError, req_e.ChunkedEncodingError):
         logger.info('_______连接超时 无效代理________')
         return (False, )
Exemple #10
0
 def status_ok(self):
     """Parse the ``results`` of a response that has no further page.

     Returns:
         * the fixed string '结果超出20个' when the response carries a
           ``next_page_token`` key;
         * otherwise a list of ``self.parser(entry)`` for each result, after
           printing a success line for the location/types parameters;
         * ``None`` (after logging the URL) when ``results`` is missing or
           empty.
     """
     if 'next_page_token' in self.respond:
         return '结果超出20个'
     results = self.respond.get('results')
     if not results:
         logger.info('结果为空 %s' % self.req_url)
         return None
     res_list = [self.parser(entry) for entry in results]
     place_and_kind = (self.params['location'], self.params['types'])
     print('%s %s 采集成功' % place_and_kind)
     return res_list
Exemple #11
0
 def expand(self, distance, lng1=0, lat1=0, lng2=0, lat2=0):
     """Return a new ``Rectangle`` grown outward from this one.

     Args:
         distance: amount to grow by; in metres according to the original
             comment.
         lng1, lat1, lng2, lat2: explicit edge overrides. A non-zero value
             is used as that edge directly; 0 means "compute it from this
             rectangle's own edge and ``distance``".

     Returns:
         Rectangle: built from the four edges, each rounded to 6 decimals.
         The resulting edges are also logged.
     """
     # Per-metre conversion factors used by the original code.
     lng_offset = 0.00001141 * distance
     grown_lng1 = self.lng1 - lng_offset
     grown_lng2 = self.lng2 + lng_offset
     lat_offset = 0.00000899 * distance
     grown_lat1 = self.lat1 - lat_offset
     grown_lat2 = self.lat2 + lat_offset

     ex_lng1 = round(lng1 if lng1 != 0 else grown_lng1, 6)
     ex_lng2 = round(lng2 if lng2 != 0 else grown_lng2, 6)
     ex_lat1 = round(lat1 if lat1 != 0 else grown_lat1, 6)
     ex_lat2 = round(lat2 if lat2 != 0 else grown_lat2, 6)

     corners = (ex_lng1, ex_lat1, ex_lng2, ex_lat2)
     logger.info('拓展矩形为(%s,%s,%s,%s)' % corners)
     return Rectangle(ex_lng1, ex_lat1, ex_lng2, ex_lat2)
Exemple #12
0
 def fetch_new_proxyes(self, num):
     """Collect validated proxies from the crawler methods, in a fixed order.

     The crawlers (ip181, 66ip, xici, kxdaili) are tried one after another;
     each is asked for the number of proxies still missing, and the loop
     stops once that number drops to zero or below.

     Args:
         num: number of proxies wanted.

     Returns:
         list: all proxies returned by the crawlers that were run.
     """
     sources = (
         self.fetch_ip181,
         self.fetch_66ip,
         self.fetch_xici,
         self.fetch_kxdaili,
     )
     collected = []
     remaining = num
     for source in sources:
         batch = source(remaining)
         logger.info('_______抓取新代理%s________'%len(batch))
         collected += batch
         remaining -= len(batch)
         if remaining <= 0:
             logger.info('_______代理抓取完毕,共%s________'%len(collected))
             break
     return collected
Exemple #13
0
    def scheduler_by_statuscode(self, status_code):
        """Dispatch on the HTTP status code of the last request.

        * 200: store the decoded JSON body in ``self.respond`` and, when a
          proxy is recorded in ``self.proxys['proxies']``, return it to
          ``Fetch_proxy.proxy_pool``. If the body is not valid JSON, empty
          slots (``,,`` and ``[,``) are padded with ``""`` and the text is
          parsed as a Python literal; a resulting list becomes
          ``self.respond``, anything else is kept in ``self._respond`` and
          ``self.respond`` is set to ``None``.
        * 301 / 302 / 403 / 429 / 502: ``self.status_change_proxy()``.
        * 400 / 401 / 402 / 404: log and set ``self.respond`` to ``None``.
        * 202 / 204: print the code, wait two seconds, then
          ``self.status_change_user_agent()``.
        * anything else (including 500): print the code and call
          ``self.status_change_user_agent()``.

        Args:
            status_code: integer HTTP status code of ``self.req``.
        """
        import ast  # local import: only the non-JSON fallback needs it

        if status_code == 200:
            try:
                self.respond = self.req.json()
            except ValueError:
                # JSON decode errors are ValueError subclasses; only those
                # trigger the loose-list fallback.
                content = self.req.text
                while ",," in content:
                    content = content.replace(',,', ',"",')
                while "[," in content:
                    content = content.replace("[,", '["",')
                try:
                    # SECURITY: the body comes from a remote server, so it is
                    # parsed with ast.literal_eval (literals only) rather than
                    # eval(), which would execute arbitrary expressions.
                    content = ast.literal_eval(content)
                except (ValueError, SyntaxError, TypeError, MemoryError,
                        RecursionError):
                    # Not a Python literal: keep the padded text as-is; it is
                    # stored in self._respond below.
                    pass
                if isinstance(content, list):
                    self.respond = content
                else:
                    self._respond = content
                    self.respond = None
            else:
                # Kept outside the try so a bookkeeping error here can no
                # longer discard a successfully decoded response.
                if self.proxys['proxies']:
                    # Recycle the proxy that has not been used up.
                    Fetch_proxy.proxy_pool.append(self.proxys['proxies'])

        elif status_code in (301, 302, 403, 429, 502):
            self.status_change_proxy()

        elif status_code in (400, 401, 402, 404):
            logger.info('%s_%s 没有信息' % (self.url, status_code))
            self.respond = None

        elif status_code in (202, 204):
            print(status_code)
            time.sleep(2)
            self.status_change_user_agent()

        else:
            # 500 and every unlisted code are handled the same way.
            print(status_code)
            self.status_change_user_agent()
Exemple #14
0
 def fetch_66ip(self, num):
     """Scrape http://www.66ip.cn/ for proxies (the source rates its quality at ~25%).

     Requests a plain-text export of proxies, validates each non-blank
     entry with ``self.proxy_vaild`` and stops once ``num`` valid proxies
     have been collected.

     Args:
         num: number of validated proxies wanted.

     Returns:
         list: the proxy dicts accepted by ``self.proxy_vaild``.
     """
     proxyes = []
     url = "http://www.66ip.cn/nmtq.php?getnum=100&isp=0&anonymoustype=3&start=&ports=&export=&ipaddress=&area=1&proxytype=0&api=66ip"
     # A timeout keeps a stalled proxy-list site from blocking the crawl forever.
     req = requests.get(url, headers=self.headers, timeout=10)
     sections = req.text.split("</script>")
     # The entries are read from the text after the first </script>, separated
     # by <br />. If that marker is missing (layout change, error page) treat
     # the page as containing no proxies instead of raising IndexError.
     if len(sections) > 1:
         candidates = sections[1].split("<br />")[:-1]
     else:
         candidates = []
     for u in candidates:
         proxy = u.strip()
         if not proxy:
             continue
         proxy_dict = {'http': proxy, 'https': proxy}
         valid_res = self.proxy_vaild(proxy_dict)
         if valid_res[0]:
             proxyes.append(valid_res[1])
         if len(proxyes) >= num:
             break
     logger.info('抓取 66ip,有效代理 %d 个'%(len(proxyes)))
     return proxyes
Exemple #15
0
 def fetch_ip181(self, num):
     """Scrape http://www.ip181.com/ for proxies.

     The original note says the site refreshes 100 entries every 10 minutes
     and rates its quality at ~55%. Every table row after the first whose
     latency (first token of the 5th cell) is below 0.5 is checked with
     ``self.proxy_vaild``; scanning stops once ``num`` valid proxies have
     been collected.

     Args:
         num: number of validated proxies wanted.

     Returns:
         list: the proxy dicts accepted by ``self.proxy_vaild``.
     """
     proxyes = []
     url = 'http://www.ip181.com/'
     # A timeout keeps a stalled proxy-list site from blocking the crawl forever.
     req = requests.get(url, headers=self.headers, timeout=10)
     selector = etree.HTML(req.text)
     # NOTE(review): guards against the parser yielding no document - confirm for the lxml version in use.
     rows = selector.xpath('//tr') if selector is not None else []
     # The first row is skipped, as in the original (presumably the header).
     for line in rows[1:]:
         tds = line.xpath('td/text()')
         # Rows with fewer than five text cells cannot be rated; skip them
         # instead of raising IndexError.
         if len(tds) < 5:
             continue
         try:
             latency = float(tds[4].split(' ')[0])
         except ValueError:
             # Non-numeric latency text: skip the row.
             continue
         if latency < 0.5:
             proxy = "%s:%s" % (tds[0], tds[1])
             proxy_dict = {'http': proxy, 'https': proxy}
             valid_res = self.proxy_vaild(proxy_dict)
             if valid_res[0]:
                 proxyes.append(valid_res[1])
             if len(proxyes) >= num:
                 break
     logger.info('抓取 ip181,有效代理 %d 个'%(len(proxyes)))
     return proxyes
Exemple #16
0
 def fetch_kxdaili(self, num):
     """Scrape http://www.kxdaili.com/ for proxies (the source rates its quality at ~5%).

     Walks at most ten listing pages. Every table row whose latency (first
     token of the 5th cell) is below 0.5 is checked with
     ``self.proxy_vaild``.

     Args:
         num: number of validated proxies wanted.

     Returns:
         list: the proxy dicts accepted by ``self.proxy_vaild``; at most
         ``num`` entries (empty when ``num`` is 0 or negative).
     """
     max_pages = 10
     page = 1
     proxyes = []
     # Strict '<': stop as soon as `num` proxies have been collected.
     while len(proxyes) < num and page <= max_pages:
         url = "http://www.kxdaili.com/dailiip/1/%d.html" % page
         # A timeout keeps a stalled proxy-list site from blocking the crawl forever.
         req = requests.get(url, headers=self.headers, timeout=10)
         selector = etree.HTML(req.text)
         # NOTE(review): guards against the parser yielding no document - confirm for the lxml version in use.
         rows = selector.xpath('//tr') if selector is not None else []
         for line in rows:
             tds = line.xpath('td/text()')
             # Rows without five <td> text cells (e.g. a header row made of
             # <th> cells) are skipped instead of raising IndexError.
             if len(tds) < 5:
                 continue
             try:
                 latency = float(tds[4].split(' ')[0])
             except ValueError:
                 # Non-numeric latency text: skip the row.
                 continue
             if latency < 0.5:
                 proxy = "%s:%s" % (tds[0], tds[1])
                 proxy_dict = {'http': proxy, 'https': proxy}
                 valid_res = self.proxy_vaild(proxy_dict)
                 if valid_res[0]:
                     proxyes.append(valid_res[1])
                     if len(proxyes) >= num:
                         # Enough proxies: do not validate the rest of the page.
                         break
         logger.info('抓取 kxdaili 第 %d 页,有效代理 %d 个'%(page, len(proxyes)))
         page += 1
     return proxyes
Exemple #17
0
 def parser(self, json_dict):
     """Convert one decoded response into a list of point dicts.

     Behaviour depends on ``json_dict['code']``:

     * 0 with data: one dict per entry with ``gcj_lng``/``gcj_lat`` (from
       ``grid_x``/``grid_y``), ``lng``/``lat`` (via
       ``transCoordinateSystem.gcj02_to_wgs84``), ``count`` divided by the
       smallest count in the response, and ``req_time``. The instance
       cookies are copied to ``Easygo_Clawer.cookies`` and the list is
       returned.
     * 0 without data: prints and logs a "no points" message; returns None.
     * 3 or -100: logs the account state, waits three seconds, fetches new
       cookies, re-runs ``self.process()``, sets ``Easygo_Clawer.req_num``
       to 1 and returns the retried result.
     * anything else: prints the payload, logs an unknown-error line and
       returns None.
     """
     print(json_dict)
     datas = json_dict.get('data')
     codes = json_dict.get('code')

     if codes == 0:
         if len(datas) == 0:
             print("此区域没有点信息")
             logger.info("此区域没有点信息 %s" % self.req_url)
             return None

         # Smallest count in the response; every count is scaled by it.
         floor = datas[0]['count']
         for cell in datas:
             floor = min(cell['count'], floor)

         points = []
         for cell in datas:
             # Per the original note, this grid-to-coordinate formula comes
             # from the Easygo web page's JS bundle:
             # http://c.easygo.qq.com/eg_toc/js/map-55f0ea7694.bundle.js
             gcj_lng = 1e-6 * (250.0 * cell['grid_x'] + 125.0)
             gcj_lat = 1e-6 * (250.0 * cell['grid_y'] + 125.0)
             wgs_lng, wgs_lat = transCoordinateSystem.gcj02_to_wgs84(
                 gcj_lng, gcj_lat)
             points.append({
                 'gcj_lng': gcj_lng,
                 'gcj_lat': gcj_lat,
                 'lng': wgs_lng,
                 'lat': wgs_lat,
                 'count': cell['count'] / floor,
                 'req_time': datetime.datetime.now().strftime(
                     '%Y-%m-%d %H:%M:%S'),
             })
         Easygo_Clawer.cookies = self.cookies
         return points

     if codes == 3:
         notice = "%s 账号需要验证"
     elif codes == -100:
         notice = "%s 账号已用完"
     else:
         print(json_dict)
         logger.info("%s 账号出现未知错误" % self.qq_account)
         return None

     # Both account problems are handled the same way: new cookies, then retry.
     logger.info(notice % self.qq_account)
     time.sleep(3)
     self.cookies = self.get_cookie()
     points = self.process()
     Easygo_Clawer.req_num = 1
     return points
Exemple #18
0
 def status_pass(self):
     """Log that the current request URL has been skipped."""
     message = '已跳过 %s' % self.req.url
     logger.info(message)
Exemple #19
0
 def status_change_key(self):
     """Log the key change, call ``self.params.update_key()`` and re-run ``self.process()``.

     Returns:
         Whatever ``self.process()`` returns.
     """
     message = '更换密钥 %s' % self.req.url
     logger.info(message)
     self.params.update_key()
     outcome = self.process()
     return outcome
Exemple #20
0
        "从化区": [Rectangle(113.2738078, 23.37099304, 114.0565605, 23.93695479)],
        "番禺区": [Rectangle(113.2429326, 22.87177748, 113.5533215, 23.08258251)],
        "海珠区": [Rectangle(113.2333014, 23.04533721, 113.4122732, 23.11366537)],
        "花都区": [Rectangle(112.9540515, 23.24907373, 113.4694197, 23.61688869)],
        "荔湾区": [Rectangle(113.1706897, 23.0442161, 113.2693343, 23.15839047)],
        "黄埔区": [Rectangle(113.389631, 23.03409065, 113.6017962, 23.42672447)],
        "南沙区": [Rectangle(113.2911038, 22.56227328, 113.6843494, 22.90920969)],
        "天河区": [Rectangle(113.2922662, 23.09766052, 113.4391771, 23.24457675)],
        "越秀区": [Rectangle(113.2323543, 23.10463126, 113.3178628, 23.17175286)],
        "增城区": [Rectangle(113.5406707, 23.08627615, 113.9949777, 23.62208945)]
    }
    start_time = datetime.datetime.now().strftime('%y-%m-%d %I:%M:%S %p')
    info_dict = {
        '名称': 'Google POI 抓取工具V1.0',
        '邮箱': '*****@*****.**',
        '起始时间': start_time,
        '终止时间': '20180401'
    }
    logger.info(param_info(info_dict))
    for region_name, rect_list in rect_dict.items():
        main(region_name, rect_list)
    email_alarm = Email_alarm()
    end_time = datetime.datetime.now().strftime('%y-%m-%d %I:%M:%S %p')
    info_dict = {
        '名称': 'Google POI 抓取工具V1.0',
        '邮箱': '*****@*****.**',
        '起始时间': start_time,
        '终止时间': end_time
    }
    email_alarm.send_mail(param_info(info_dict))
Exemple #21
0
 def save_proxy(self, res_list):
     """Write ``res_list`` to ``proxy.csv`` through a DataFrame and log it.

     Args:
         res_list: data accepted by ``pd.DataFrame``.
     """
     pd.DataFrame(res_list).to_csv('proxy.csv')
     logger.info('_______代理已储存________')
Exemple #22
0
 def status_invalid_request(self):
     """Log the current request URL as a request error."""
     message = '请求错误 %s' % self.req.url
     logger.info(message)
Exemple #23
0
 def status_unknown_error(self):
     """Log the current request URL as an unknown error."""
     message = '未知错误 %s' % self.req.url
     logger.info(message)
def param_info(info_dict):
    """Render ``info_dict`` as a two-column table and log it.

    Builds a ``prettytable.PrettyTable`` with the header row
    ['项目', '描述'], adds one row per key/value pair of ``info_dict``,
    sets the table alignment to 'l' and writes its string form (preceded by
    a newline) to ``logger`` at INFO level.

    Args:
        info_dict: mapping whose keys fill the first column and whose
            values fill the second.

    NOTE(review): no ``return`` statement is visible in this view, yet code
    elsewhere in this file passes the call's result to ``logger.info`` and
    ``send_mail`` - confirm whether the function is expected to return the
    rendered table.
    """
    info_table = prettytable.PrettyTable(['项目', '描述'])
    # One table row per entry, in the dict's iteration order.
    for key in list(info_dict.keys()):
        info_table.add_row([key, info_dict[key]])
    info_table.align = 'l'
    # Leading newline so the table starts on its own line in the log output.
    logger.info('\n' + str(info_table))