def remove_ineffective_proxy(proxy):
    """Probe a proxy against a random validation URL and drop it from Redis on failure."""
    url = random.choice(validate_urls)
    try:
        requests.get(url, headers=headers, proxies=proxy, timeout=3)
        print(proxy, 'ok')
    except requests.RequestException:
        host, port = re.findall(r'//(.+):(\d+)', proxy['http'])[0]
        check_str = '{}|{}'.format(host, port)
        RedisClient.remove_proxy(check_str)
        print(proxy, 'fail')
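# remove_ineffective_proxy relies on two module-level names that are not shown
# in this section: `validate_urls` and `headers`. A minimal sketch of what they
# might look like; the concrete URLs and User-Agent are illustrative assumptions:
import random
import re

import requests

validate_urls = [
    'http://www.baidu.com',  # assumed validation target
    'http://www.qq.com',     # assumed validation target
]
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',  # assumed UA
}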
def multiprocess_filter_proxy():
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    print(RedisClient.num_proxy_nofilter())
    for _ in range(RedisClient.num_proxy_nofilter()):
        item = RedisClient.pop_proxy_nofilter()
        ip, port = item.split('|')
        pool.apply_async(filter_proxy, args=(ip, port))
    pool.close()
    pool.join()
    print('over')
def filter_proxy(ip, port):
    proxy = {
        'http': 'http://{}:{}'.format(ip, port),
        'https': 'https://{}:{}'.format(ip, port),
    }
    validator = Validator()
    result = validator.validate_proxy(proxy=proxy)
    # print(result, ip, port)
    if result == 4:
        # store the result in Redis
        RedisClient.add_proxy(check_str='{}|{}'.format(ip, port), value=result)
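# `Validator` is referenced above but not defined in this section. A minimal
# sketch of one plausible implementation, assuming validate_proxy() returns an
# integer quality score where 4 means the proxy worked over both schemes; the
# scoring weights and probe URLs are assumptions:
import requests

class Validator:
    PROBE_URLS = {'http': 'http://httpbin.org/ip',
                  'https': 'https://httpbin.org/ip'}  # assumed probe targets

    def validate_proxy(self, proxy):
        score = 0
        for scheme, url in self.PROBE_URLS.items():
            try:
                res = requests.get(url, proxies={scheme: proxy[scheme]}, timeout=5)
                if res.status_code == 200:
                    score += 2  # assumed weighting: 2 points per working scheme
            except requests.RequestException:
                pass
        return score  # 4 == both schemes succeeded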
def crawl_66ip():
    """66ip proxy: http://www.66ip.cn"""
    url = ('http://www.66ip.cn/nmtq.php?getnum=100&isp=0&anonymoustype=4'
           '&start=&ports=&export=&ipaddress=&area=0&proxytype=2&api=66ip')
    res = redefine_requests(url)
    if not res:
        return
    items = re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', res.text)
    for item in items:
        ip, port = item.split(':')
        RedisClient.add_proxy_nofilter(check_str='{}|{}'.format(ip, port))
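# Every crawler in this section calls `redefine_requests`, which is not defined
# here. A minimal sketch, assuming it is a thin wrapper around requests.get that
# swallows errors and returns None on failure; the default headers and timeout
# are assumptions:
import requests

def redefine_requests(url, timeout=10):
    try:
        res = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0'},  # assumed default UA
            timeout=timeout,
        )
        if res.status_code == 200:
            return res
    except requests.RequestException:
        pass
    return None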
class AsyncAlarmThread(threading.Thread):
    def __init__(self, **conf):
        threading.Thread.__init__(self)
        # named _stop_event rather than _stop, which Thread uses internally in Python 3
        self._stop_event = threading.Event()
        self.rs = RedisClient(db='alert', **conf)

    def stop(self):
        self._stop_event.set()

    def run(self):
        while not self._stop_event.is_set():
            self.errors = self.rs.hgetall('total:alarm:errors')
            message = dict()
            for item, errors in self.errors.items():
                item = item.split('|')[0]
                error = json.loads(errors)
                for e in error:
                    try:
                        status, info, product, idc, service, black = e.split('|')
                        if black == 'is_black':
                            continue
                        title = item
                        content = '%s,%s,%s,%s,%s' % (status, info, product, idc, service)
                        content = content.replace(',,,', ',')
                    except Exception:
                        continue
                    if title not in message:
                        message[title] = list()
                    message[title].append(content)
            XMPPBOT.MESSAGE = message
            # sleep up to 30 seconds, waking early if stop() is called
            for _ in range(30):
                if not self._stop_event.is_set():
                    sleep(1)
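# The thread above expects the Redis hash `total:alarm:errors` to map an item
# key to a JSON list of pipe-delimited records. A sketch of how a producer
# might populate one entry under that assumption; the field values and hash
# key layout are illustrative:
import json

import redis

r = redis.StrictRedis()  # assumes a default localhost Redis for illustration
record = '|'.join(['CRITICAL', 'disk full', 'webapp', 'idc-bj', 'nginx', 'not_black'])
r.hset('total:alarm:errors', 'host-01|extra', json.dumps([record]))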
def crawl_ip3366():
    """ip3366 (Yun proxy): http://www.ip3366.net"""
    for page in range(1, 6):
        url = 'http://www.ip3366.net/?stype=1&page={}'.format(page)
        res = redefine_requests(url)
        if not res:
            continue
        soup = BeautifulSoup(res.text, 'lxml')
        items = soup.select('#list table tbody tr')
        for item in items:
            tds = item.select('td')
            ip = tds[0].text.strip()
            port = tds[1].text.strip()
            RedisClient.add_proxy_nofilter(check_str='{}|{}'.format(ip, port))
def crawl_kuaidaili():
    """Kuaidaili: https://www.kuaidaili.com"""
    for page in range(1, 6):
        url = 'https://www.kuaidaili.com/free/inha/{}/'.format(page)
        res = redefine_requests(url)
        if not res:
            continue
        soup = BeautifulSoup(res.text, 'lxml')
        items = soup.select('#list table tbody tr')
        for item in items:
            ip = item.find('td', {'data-title': 'IP'}).text.strip()
            port = item.find('td', {'data-title': 'PORT'}).text.strip()
            RedisClient.add_proxy_nofilter(check_str='{}|{}'.format(ip, port))
def crawl_swei360():
    """swei360 (360 proxy): http://www.swei360.com"""
    for page in range(1, 2):
        for style in [1, 3]:
            print(page, style)
            url = 'http://www.swei360.com/free/?stype={}&page={}'.format(style, page)
            res = redefine_requests(url, timeout=20)
            if not res:
                continue
            soup = BeautifulSoup(res.text, 'lxml')
            items = soup.select('#list table tbody tr')
            for item in items:
                tds = item.select('td')
                ip = tds[0].text.strip()
                port = tds[1].text.strip()
                RedisClient.add_proxy_nofilter(check_str='{}|{}'.format(ip, port))
def popular():
    r = RedisClient.get_instance(dev=False)
    pop = r.hgetall('popular')
    sorted_searches = sorted(pop.items(), key=lambda x: int(x[1]), reverse=True)[:10]
    final_dict = {}
    for sorted_search in sorted_searches:
        final_dict[sorted_search[0].decode('utf-8')] = int(sorted_search[1].decode('utf-8'))
    return jsonify(final_dict)
def crawl_data5u():
    """data5u (Wuyou proxy): http://www.data5u.com/"""
    urls = [
        'http://www.data5u.com/free/gwgn/index.shtml',
        'http://www.data5u.com/free/gwpt/index.shtml',
        'http://www.data5u.com/free/index.shtml',
    ]
    for url in urls:
        res = redefine_requests(url)
        if not res:
            continue
        soup = BeautifulSoup(res.text, 'lxml')
        items = soup.find_all('ul', class_='l2')
        for item in items:
            spans = item.find_all('span')
            ip = spans[0].text.strip()
            port = spans[1].text.strip()
            RedisClient.add_proxy_nofilter(check_str='{}|{}'.format(ip, port))
def crawl_xici():
    """Xici proxy: http://www.xicidaili.com"""
    urls = [
        'http://www.xicidaili.com/nn/',
        'http://www.xicidaili.com/nt/',
        'http://www.xicidaili.com/wn/',
        'http://www.xicidaili.com/wt/',
    ]
    for url in urls:
        for page in range(1, 2):
            res = redefine_requests('{}{}'.format(url, page))
            if not res:
                continue
            soup = BeautifulSoup(res.text, 'lxml')
            items = soup.select('#ip_list .odd')
            for item in items:
                tds = item.select('td')
                ip = tds[1].text.strip()
                port = tds[2].text.strip()
                RedisClient.add_proxy_nofilter(check_str='{}|{}'.format(ip, port))
def run():
    """Periodically re-validate stored proxies and delete the ones that no longer work."""
    while True:
        pool = multiprocessing.Pool(multiprocessing.cpu_count())
        proxies = RedisClient.get_all_proxies()
        print(len(proxies))
        for proxy in proxies:
            pool.apply_async(remove_ineffective_proxy, args=(proxy,))
        pool.close()
        pool.join()
        time.sleep(10)
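# The module-level RedisClient used by the proxy functions above exposes a
# small static interface over two structures: a queue of freshly crawled
# `ip|port` strings awaiting validation, and a hash of validated proxies.
# A minimal sketch under those assumptions; the key names, staticmethod style,
# and default connection are all assumptions:
import redis

class RedisClient:
    _r = redis.StrictRedis()  # assumed default localhost connection

    @staticmethod
    def add_proxy_nofilter(check_str):
        RedisClient._r.rpush('proxies:nofilter', check_str)

    @staticmethod
    def num_proxy_nofilter():
        return RedisClient._r.llen('proxies:nofilter')

    @staticmethod
    def pop_proxy_nofilter():
        return RedisClient._r.lpop('proxies:nofilter').decode('utf-8')

    @staticmethod
    def add_proxy(check_str, value):
        RedisClient._r.hset('proxies:checked', check_str, value)

    @staticmethod
    def remove_proxy(check_str):
        RedisClient._r.hdel('proxies:checked', check_str)

    @staticmethod
    def get_all_proxies():
        # rebuild requests-style proxy dicts from the stored `ip|port` keys
        proxies = []
        for key in RedisClient._r.hkeys('proxies:checked'):
            ip, port = key.decode('utf-8').split('|')
            proxies.append({'http': 'http://{}:{}'.format(ip, port),
                            'https': 'https://{}:{}'.format(ip, port)})
        return proxies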
def search():
    data = json.loads(request.data.decode('utf-8'))
    query = data['q']
    prefs = data['prefs']
    es = ElasticStorage.get_instance(dev=False)
    r = RedisClient.get_instance(dev=False)
    # track query popularity in a Redis hash
    if r.hexists('popular', query.lower()):
        r.hincrby('popular', query.lower())
    else:
        r.hset('popular', query.lower(), 1)
    articles = es.query_articles(query, prefs)
    articles = list(articles)
    # deduplicate articles by title
    articles = list({article['title']: article for article in articles}.values())
    for article in articles:
        for key, value in source_map.items():
            if key in article['url']:
                article['source'] = value
    return jsonify(articles=articles)
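# A sketch of how a client might call the popular() and search() handlers
# above, assuming they are mounted at GET /popular and POST /search on a local
# dev server; the route paths, host, and port are assumptions:
import requests

resp = requests.post('http://localhost:5000/search',
                     json={'q': 'python', 'prefs': []})
print(resp.json()['articles'])
print(requests.get('http://localhost:5000/popular').json())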
class ProxyPool(object):
    MAX_ERROR_NUM = 2

    def __init__(self):
        self.db = RedisClient()
        # self.db.recheck()
        # self.db.clear_all()
        self.crawlers = [
            xiciproxy.xiciproxy(),
            # _66proxy._66proxy(),  # additional crawlers can be appended here
        ]
        self.check_task = threading.Thread(target=self.check_useful_task)
        self.check_task.daemon = True
        self.check_task.start()
        self.interface_task = threading.Thread(target=self.interface)
        self.interface_task.daemon = True
        self.interface_task.start()
        self.update()

    def update(self):
        threading.Thread(target=self.get_data).start()

    def get_data(self):
        for crawler in self.crawlers:
            self.db.adds_temp_buffer(crawler.start())

    def start_one_check(self, check_buffer, eval_data, scheme):
        assert scheme in ('http', 'https'), "scheme must be 'http' or 'https'"
        if scheme == 'http':
            msgs = [
                (requests.get, queue.Queue(), 'http://www.qq.com', {
                    'timeout': 5,
                    'proxies': {'http': eval_data[0]},
                }),
            ]
            check_buffer.append((eval_data, msgs[0][1], 'http'))
        else:
            msgs = [
                (requests.get, queue.Queue(), 'https://www.baidu.com', {
                    'timeout': 5,
                    'proxies': {'https': eval_data[0]},
                }),
            ]
            check_buffer.append((eval_data, msgs[0][1], 'https'))
        return msgs

    def check_useful_task(self):
        check_buffer = []
        count = 0
        while True:
            count += 1
            data = self.db.pop_temp_buffer()
            if data:
                eval_data = eval(data)
                if (eval_data[4] < self.MAX_ERROR_NUM
                        and eval_data[5] < self.MAX_ERROR_NUM):
                    # alternate between http and https checks
                    if count % 2:
                        msgs = self.start_one_check(check_buffer, eval_data, 'http')
                    else:
                        msgs = self.start_one_check(check_buffer, eval_data, 'https')
                else:
                    if eval_data[4] < self.MAX_ERROR_NUM:
                        msgs = self.start_one_check(check_buffer, eval_data, 'http')
                    elif eval_data[5] < self.MAX_ERROR_NUM:
                        msgs = self.start_one_check(check_buffer, eval_data, 'https')
                    else:
                        continue
                downloader(msgs)
            else:
                # iterate over a copy, since items may be removed below
                for item in list(check_buffer):
                    if not item[1].empty():
                        res = item[1].get()
                        if res:
                            res.encoding = res.apparent_encoding
                            # match the real page titles to verify the proxy
                            # actually reached the target site
                            if item[2] == 'http':
                                if re.findall('<title>腾讯首页</title>', res.text):
                                    item[0][4] = self.MAX_ERROR_NUM
                                    self.db.adds_http_pool(((*item[0][:4], 0, item[0][6]),))
                                else:
                                    item[0][4] += 1
                            elif item[2] == 'https':
                                if re.findall('<title>百度一下,你就知道</title>', res.text):
                                    item[0][5] = self.MAX_ERROR_NUM
                                    self.db.adds_https_pool(((*item[0][:4], 0, item[0][6]),))
                                else:
                                    item[0][5] += 1
                        else:
                            if item[2] == 'http':
                                item[0][4] += 1
                            elif item[2] == 'https':
                                item[0][5] += 1
                        if (item[0][4] < self.MAX_ERROR_NUM
                                or item[0][5] < self.MAX_ERROR_NUM):
                            self.db.adds_temp_buffer((item[0],))
                        check_buffer.remove(item)
                time.sleep(0.01)

    def interface(self):
        while True:
            instr = input('>>>')
            if instr == 'exit':
                sys.exit()
            elif instr == 'get_http_one':
                print(self.db.get_http_one())
            elif instr == 'get_https_one':
                print(self.db.get_https_one())
            else:
                print('Unrecognized command, please try again')
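# `downloader` is called by check_useful_task but not defined in this section.
# A minimal sketch consistent with how msgs is built above: each msg is a
# (callable, result_queue, url, kwargs) tuple, and the downloader fires the
# request on a worker thread, putting the response (or None on failure) onto
# the queue for the check loop to consume:
import threading

def downloader(msgs):
    def worker(func, result_queue, url, kwargs):
        try:
            result_queue.put(func(url, **kwargs))
        except Exception:
            result_queue.put(None)  # signal failure; the caller counts the error

    for func, result_queue, url, kwargs in msgs:
        threading.Thread(target=worker,
                         args=(func, result_queue, url, kwargs),
                         daemon=True).start()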
#!/usr/bin/env python
# coding=utf-8
from sanic import Sanic
from sanic.response import json, html
import os
import sys

base_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, base_dir)

from database import RedisClient

app = Sanic()
redis_conn = RedisClient()


@app.route("/")
async def index(request):
    return html('<h2>Welcome to Proxy Pool System</h2>')


@app.route("/pop")
async def pop_proxy(request):
    proxy = await redis_conn.pop_proxy()
    if proxy:
        proxy = proxy.decode('utf-8')
        if proxy.startswith('https'):
            return json({"https": proxy})
        else:
            return json({"http": proxy})
    # no proxy available
    return json({})
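# A sketch of how a consumer might pull a proxy from the Sanic service above,
# assuming the app is run on localhost:8000 (e.g. app.run(host='0.0.0.0',
# port=8000) elsewhere); the host and port are assumptions:
import requests

proxy = requests.get('http://localhost:8000/pop').json()
print(proxy)  # e.g. {"http": "http://1.2.3.4:8080"}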
def __init__(self):
    self.redis = RedisClient()
class DHTServer:
    def __init__(self, bind_ip, bind_port, process_id):
        self.bind_ip = bind_ip
        self.bind_port = bind_port
        self.process_id = process_id
        self.nid = get_rand_id()
        # nodes is a double-ended queue
        self.nodes = deque(maxlen=MAX_NODE_QSIZE)
        # KRPC is a simple RPC structure encoded with bencode and carried over UDP.
        self.udp = socket.socket(socket.AF_INET, socket.SOCK_DGRAM,
                                 socket.IPPROTO_UDP)
        # bind the UDP socket
        self.udp.bind((self.bind_ip, self.bind_port))
        # Redis client
        self.rc = RedisClient()
        self.logger = get_logger("logger_{}".format(bind_port))

    def bootstrap(self):
        """Join the DHT network by posing as a DHT node to the tracker servers."""
        for address in BOOTSTRAP_NODES:
            self.send_find_node(address)

    def bs_timer(self):
        """Run bootstrap() periodically."""
        t = 1
        while True:
            if t % PER_SEC_BS_TIMER == 0:
                t = 1
                self.bootstrap()
            t += 1
            time.sleep(1)

    def send_krpc(self, msg, address):
        """
        Send a KRPC message.

        :param msg: the UDP message to send
        :param address: destination (ip, port) tuple
        """
        try:
            # msg must be bencode-encoded
            self.udp.sendto(bencoder.bencode(msg), address)
        except Exception:
            pass

    def send_error(self, tid, address):
        """Send an error reply."""
        msg = dict(t=tid, y="e", e=[202, "Server Error"])
        self.send_krpc(msg, address)

    def send_find_node(self, address, nid=None):
        """
        Send a find_node request.

        `find_node request`
        find_node is used to look up the contact information of a node with a
        given ID; in KRPC this is "q" == "find_node". The request carries two
        arguments: the first is `id`, the ID of the querying node; the second
        is `target`, the ID of the node the requester is looking for. A node
        that receives a find_node request should reply with two keys, `id` and
        `nodes`, where `nodes` is a string holding the contact information of
        the K (8) nodes in the queried node's routing table closest to the target.

        `Example`
        arguments: {"id": "<querying node's id>", "target": "<id of target node>"}
        response:  {"id": "<queried node's id>", "nodes": "<compact node info>"}

        :param address: (ip, port) tuple
        :param nid: node id
        """
        nid = get_neighbor(nid) if nid else self.nid
        tid = get_rand_id()
        msg = dict(
            t=tid,
            y="q",
            q="find_node",  # mark the request as find_node
            a=dict(id=nid, target=get_rand_id()),
        )
        self.send_krpc(msg, address)

    def send_find_node_forever(self):
        """Send find_node requests in a loop."""
        self.logger.info("send find node forever...")
        while True:
            try:
                # pop a node off the queue
                node = self.nodes.popleft()
                self.send_find_node((node.ip, node.port), node.nid)
                time.sleep(SLEEP_TIME)
            except IndexError:
                # once the node queue is empty, rejoin the DHT network
                self.bootstrap()

    def save_magnet(self, info_hash):
        """
        Save a magnet link to the database.

        :param info_hash: the magnet link's info_hash
        """
        # decode info_hash with codecs
        hex_info_hash = codecs.getencoder("hex")(info_hash)[0].decode()
        magnet = MAGNET_PER.format(hex_info_hash)
        self.rc.add_magnet(magnet)
        self.logger.info("pid_{0} - {1}".format(self.process_id, magnet))

    def on_message(self, msg, address):
        """
        Dispatch incoming messages.

        :param msg: message payload
        :param address: message sender address
        """
        try:
            # `response`
            # A KRPC message whose y key is "r" carries one additional key, r,
            # a dictionary holding the returned values. A response is sent only
            # after the request message has been parsed correctly.
            if msg[b"y"] == b"r":
                # nodes is a string holding the contact information of the K
                # nodes in the queried node's routing table closest to the target.
                if msg[b"r"].get(b"nodes", None):
                    self.on_find_node_response(msg)
            # `request`
            # A KRPC message whose y key is "q" carries two additional keys,
            # q and a. q is a string naming the requested method; a is a
            # dictionary of request arguments. In practice we only need the
            # info hash from either request type, to build magnet links and
            # later fetch the torrents.
            elif msg[b"y"] == b"q":
                # get_peers is tied to a torrent's info_hash; in KRPC this is
                # "q" == "get_peers". The request carries two arguments: `id`,
                # the ID of the querying node, and `info_hash`, the torrent's
                # info_hash.
                if msg[b"q"] == b"get_peers":
                    self.on_get_peers_request(msg, address)
                # announce_peer announces that the querying node is downloading
                # a torrent on some port. It carries four arguments: `id`, the
                # querying node's ID; `info_hash`, the torrent's info_hash;
                # `port`, the integer port the peer is downloading on; and
                # `token`, received in the reply to an earlier get_peers request.
                elif msg[b"q"] == b"announce_peer":
                    self.on_announce_peer_request(msg, address)
        except KeyError:
            pass

    def on_find_node_response(self, msg):
        """
        Decode node info from a find_node response and store it in the deque.

        :param msg: node message payload
        """
        nodes = get_nodes_info(msg[b"r"][b"nodes"])
        for node in nodes:
            nid, ip, port = node
            # validate the node
            if len(nid) != PER_NID_LEN or ip == self.bind_ip:
                continue
            # add the node to the deque
            self.nodes.append(HNode(nid, ip, port))

    def on_get_peers_request(self, msg, address):
        """
        Handle a get_peers request and extract the info hash.

        :param msg: node message payload
        :param address: node address
        """
        tid = msg[b"t"]
        try:
            info_hash = msg[b"a"][b"info_hash"]
            self.save_magnet(info_hash)
        except KeyError:
            # no info hash present; send an error reply
            self.send_error(tid, address)

    def on_announce_peer_request(self, msg, address):
        """
        Handle an announce_peer request and extract the info hash.

        This crawler only collects magnet links for now, so address and port
        are ignored; extend this handler if you need them.

        :param msg: node message payload
        :param address: node address
        """
        tid = msg[b"t"]
        try:
            info_hash = msg[b"a"][b"info_hash"]
            self.save_magnet(info_hash)
        except KeyError:
            # no info hash present; send an error reply
            self.send_error(tid, address)

    def receive_response_forever(self):
        """Receive UDP data in a loop."""
        self.logger.info("receive response forever {}:{}".format(
            self.bind_ip, self.bind_port))
        # join the DHT network first
        self.bootstrap()
        while True:
            try:
                # receive a reply
                data, address = self.udp.recvfrom(UDP_RECV_BUFFSIZE)
                # decode the returned data with bdecode
                msg = bencoder.bdecode(data)
                # handle the message
                self.on_message(msg, address)
                time.sleep(SLEEP_TIME)
            except Exception as e:
                self.logger.warning(e)
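# The class above has no entry point in this section. A sketch of how it might
# be wired up, assuming the sender loop and the bootstrap timer each get their
# own thread while the receiver blocks in the main thread; the bind address,
# port, and process id are illustrative:
import threading

if __name__ == "__main__":
    server = DHTServer("0.0.0.0", 6881, process_id=0)
    threading.Thread(target=server.send_find_node_forever, daemon=True).start()
    threading.Thread(target=server.bs_timer, daemon=True).start()
    server.receive_response_forever()  # blocks, receiving UDP replies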