def remove_ineffective_proxy(proxy):
    url = random.choice(validate_urls)
    try:
        res = requests.get(url, headers=headers, proxies=proxy, timeout=3)
        print(proxy, 'ok')
    except Exception:
        host, port = re.findall(r'//(.+):(\d+)', proxy['http'])[0]
        check_str = '{}|{}'.format(host, port)
        RedisClient.remove_proxy(check_str)
        print(proxy, 'fail')
def multiprocess_filter_proxy():
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    print(RedisClient.num_proxy_nofilter())
    for _ in range(RedisClient.num_proxy_nofilter()):
        item = RedisClient.pop_proxy_nofilter()
        ip, port = item.split('|')
        pool.apply_async(filter_proxy, args=(ip, port))
    pool.close()
    pool.join()
    print('over')
def filter_proxy(ip, port):
    proxy = {
        'http': 'http://{}:{}'.format(ip, port),
        'https': 'https://{}:{}'.format(ip, port),
    }
    validate = Validator()
    result = validate.validate_proxy(proxy=proxy)
    # print(result, ip, port)
    if result == 4:
        # store the result in Redis
        RedisClient.add_proxy(check_str='{}|{}'.format(ip, port), value=result)
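# The three functions above reference module-level names that are not shown
# here (validate_urls, headers, Validator, redefine_requests). A minimal sketch
# of what the first two might look like; the URLs and header values are
# assumptions, not taken from the original project:
validate_urls = [
    'http://httpbin.org/ip',   # hypothetical validation target
    'https://httpbin.org/ip',
]
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',  # placeholder UA
}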
def crawl_66ip():
    """
    66ip proxy: http://www.66ip.cn
    """
    url = 'http://www.66ip.cn/nmtq.php?getnum=100&isp=0&anonymoustype=4&start=&ports=&export=&ipaddress=&area=0&proxytype=2&api=66ip'
    res = redefine_requests(url)
    if not res:
        return
    items = re.findall(pattern=r'\d+\.\d+\.\d+\.\d+\:\d+', string=res.text)
    for item in items:
        ip, port = item.split(':')
        RedisClient.add_proxy_nofilter(check_str='{}|{}'.format(ip, port))
Example #7
class AsyncAlarmThread(threading.Thread):
    
    def __init__(self, **conf):
        threading.Thread.__init__(self)
        # avoid the name "_stop": threading.Thread defines a _stop() method in
        # Python 3, and shadowing it with an Event breaks join()
        self._stop_event = threading.Event()
        self.rs = RedisClient(db='alert', **conf)

    def stop(self):
        self._stop_event.set()

    def run(self):
        while not self._stop_event.is_set():
            self.errors = self.rs.hgetall('total:alarm:errors')
            message = dict()
            for item, errors in self.errors.items():
                item = item.split('|')[0]
                error = json.loads(errors)
                for e in error:
                    try:
                        status, info, product, idc, service, black = e.split('|')
                        if black == 'is_black':
                            continue
                        title = item
                        content = '%s,%s,%s,%s,%s' % (status, info, product, idc, service)
                        content = content.replace(',,,', ',')
                    except Exception:
                        continue
                    if title not in message:
                        message[title] = list()
                    message[title].append(content)
            XMPPBOT.MESSAGE = message
            for i in range(30):
                if not self._stop_event.is_set():
                    sleep(1)
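# A hedged usage sketch for AsyncAlarmThread; the Redis connection keyword
# arguments are placeholders, not values from the original project:
alarm = AsyncAlarmThread(host='127.0.0.1', port=6379)  # assumed RedisClient kwargs
alarm.start()
# ... later, during shutdown:
alarm.stop()   # signals run() to leave its polling loop
alarm.join()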
def crawl_ip3366():
    """
    ip3366 Cloud proxy: http://www.ip3366.net
    """
    for page in range(1, 6):
        url = "http://www.ip3366.net/?stype=1&page={}".format(page)
        res = redefine_requests(url)
        if not res:
            return
        soup = BeautifulSoup(res.text, 'lxml')
        items = soup.select('#list table tbody tr')
        for item in items:
            tds = item.select('td')
            ip = tds[0].text.strip()
            port = tds[1].text.strip()
            RedisClient.add_proxy_nofilter(check_str='{}|{}'.format(ip, port))
def crawl_kuaidaili():
    """
    Kuaidaili proxy: https://www.kuaidaili.com
    """
    for page in range(1, 6):
        url = "https://www.kuaidaili.com/free/inha/{}/".format(page)
        res = redefine_requests(url)
        if not res:
            continue

        soup = BeautifulSoup(res.text, 'lxml')
        items = soup.select('#list table tbody tr')
        for item in items:
            ip = item.find('td', {'data-title': 'IP'}).text.strip()
            port = item.find('td', {'data-title': 'PORT'}).text.strip()
            RedisClient.add_proxy_nofilter(check_str='{}|{}'.format(ip, port))
Example #10
def crawl_swei360():
    """
    swei360 proxy: http://www.swei360.com
    """
    for page in range(1, 2):
        for style in [1, 3]:
            print(page, style)
            url = "http://www.swei360.com/free/?stype={}&page={}".format(style, page)
            res = redefine_requests(url, timeout=20)
            if not res:
                continue
            soup = BeautifulSoup(res.text, 'lxml')
            items = soup.select('#list table tbody tr')
            for item in items:
                tds = item.select('td')
                ip = tds[0].text.strip()
                port = tds[1].text.strip()
                RedisClient.add_proxy_nofilter(check_str='{}|{}'.format(ip, port))
Example #11
def popular():
    r = RedisClient.get_instance(dev=False)
    pop = r.hgetall('popular')
    sorted_searches = sorted(pop.items(), key=lambda x:int(x[1]), reverse=True)[0:10]

    final_dict = {}
    for sorted_search in sorted_searches:
        final_dict[sorted_search[0].decode('utf-8')] = int(sorted_search[1].decode('utf-8'))

    return jsonify(final_dict)
Example #12
def crawl_data5u():
    """
    Data5u proxy: http://www.data5u.com/
    """
    urls = ["http://www.data5u.com/free/gwgn/index.shtml",
            'http://www.data5u.com/free/gwpt/index.shtml',
            'http://www.data5u.com/free/gwpt/index.shtml',
            'http://www.data5u.com/free/gwpt/index.shtml',
            'http://www.data5u.com/free/index.shtml'
            ]
    for url in urls:
        res = redefine_requests(url)
        if not res:
            return
        soup = BeautifulSoup(res.text, 'lxml')
        items = soup.find_all('ul', class_='l2')
        for item in items:
            spans = item.find_all('span')
            ip = spans[0].text.strip()
            port = spans[1].text.strip()
            RedisClient.add_proxy_nofilter(check_str='{}|{}'.format(ip, port))
Example #13
def crawl_xici():
    """
    Xici proxy: http://www.xicidaili.com
    """
    urls = ['http://www.xicidaili.com/nn/',
            'http://www.xicidaili.com/nt/',
            'http://www.xicidaili.com/wn/',
            'http://www.xicidaili.com/wt/'
            ]
    for url in urls:
        for page in range(1, 2):
            res = redefine_requests('{}{}'.format(url, page))
            if not res:
                continue
            soup = BeautifulSoup(res.text, 'lxml')
            # print(soup)
            # time.sleep(1000)
            items = soup.select('#ip_list .odd')
            for item in items:
                tds = item.select('td')
                ip = tds[1].text.strip()
                port = tds[2].text.strip()
                RedisClient.add_proxy_nofilter(check_str='{}|{}'.format(ip, port))
def run():
    """
    Periodically re-validate the proxies and remove the ones that no longer work.
    :return:
    """
    while True:
        pool = multiprocessing.Pool(multiprocessing.cpu_count())
        proxies = RedisClient.get_all_proxies()
        print(len(proxies))
        for proxy in proxies:
            pool.apply_async(remove_ineffective_proxy, args=(proxy, ))
        pool.close()
        pool.join()
        time.sleep(10)
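# A sketch of how the pieces above might be wired together, assuming the
# crawl_* functions, multiprocess_filter_proxy() and run() live in one module:
def refresh_pool():
    # 1. collect raw proxies from the free sources
    crawl_66ip()
    crawl_kuaidaili()
    # 2. validate them and move the working ones into the main pool
    multiprocess_filter_proxy()

# refresh_pool() could then be scheduled periodically while run() keeps pruning
# proxies that stop responding.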
Example #15
def search():
    data = json.loads(request.data.decode('utf-8'))
    query = data['q']
    prefs = data['prefs']

    es = ElasticStorage.get_instance(dev=False)
    r = RedisClient.get_instance(dev=False)

    if r.hexists('popular', query.lower()):
        r.hincrby('popular', query.lower())
    else:
        r.hset('popular', query.lower(), 1)

    articles = es.query_articles(query, prefs)
    articles = list(articles)
    articles = list({article['title']:article for article in articles}.values())

    for article in articles:
        for key, value in source_map.items():
            if key in article['url']:
                article['source'] = value
    return jsonify(
        articles=articles
    )
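# source_map used in search() is not shown here; it presumably maps URL
# fragments to display names. A purely hypothetical example:
source_map = {
    'bbc.co.uk': 'BBC',
    'reuters.com': 'Reuters',
}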
Example #16
class ProxyPool(object):
    MAX_ERROR_NUM = 2

    def __init__(self):
        self.db = RedisClient()
        # self.db.recheck()
        # self.db.clear_all()
        # further crawlers, e.g. _66proxy._66proxy(), can be appended here
        self.crawlers = [xiciproxy.xiciproxy()]
        self.check_task = threading.Thread(target=self.check_useful_task)
        self.check_task.daemon = True
        self.check_task.start()
        # use a separate attribute so the checker thread handle is not overwritten
        self.interface_task = threading.Thread(target=self.interface)
        self.interface_task.daemon = True
        self.interface_task.start()
        self.update()

    def update(self):
        threading.Thread(target=self.get_data).start()

    def get_data(self):
        for crawler in self.crawlers:
            self.db.adds_temp_buffer(crawler.start())

    def start_one_check(self, check_buffer, eval_data, type):
        assert type == 'http' or type == 'https', "type must be 'http' or 'https'"
        if type == 'http':
            msgs = [
                (requests.get, queue.Queue(), 'http://www.qq.com', {
                    'timeout': 5,
                    'proxies': {
                        'http': eval_data[0]
                    }
                }),
            ]
            check_buffer.append((
                eval_data,
                msgs[0][1],
                'http',
            ))
        else:
            msgs = [
                (requests.get, queue.Queue(), 'https://www.baidu.com', {
                    'timeout': 5,
                    'proxies': {
                        'https': eval_data[0]
                    }
                }),
            ]
            check_buffer.append((eval_data, msgs[0][1], 'https'))
        return msgs

    def check_useful_task(self):
        check_buffer = []
        count = 0
        while True:
            count += 1
            data = self.db.pop_temp_buffer()
            if data:
                eval_data = eval(data)
                if eval_data[4] < self.MAX_ERROR_NUM and eval_data[
                        5] < self.MAX_ERROR_NUM:
                    if count % 2:  # alternate between http and https checks
                        msgs = self.start_one_check(check_buffer, eval_data,
                                                    'http')
                    else:
                        msgs = self.start_one_check(check_buffer, eval_data,
                                                    'https')
                else:
                    if eval_data[4] < self.MAX_ERROR_NUM:
                        msgs = self.start_one_check(check_buffer, eval_data,
                                                    'http')
                    elif eval_data[5] < self.MAX_ERROR_NUM:
                        msgs = self.start_one_check(check_buffer, eval_data,
                                                    'https')
                    else:
                        continue
                downloader(msgs)
            else:
                for item in check_buffer:
                    if not item[1].empty():
                        res = item[1].get()
                        if res:
                            res.encoding = res.apparent_encoding
                            if item[2] == 'http':
                                if re.findall("<title>腾讯首页</title>", res.text):
                                    item[0][4] = self.MAX_ERROR_NUM
                                    self.db.adds_http_pool(
                                        ((*item[0][:4], 0, item[0][6]), ))
                                else:
                                    item[0][4] += 1
                            elif item[2] == 'https':
                                if re.findall("<title>百度一下,你就知道</title>",
                                              res.text):
                                    item[0][5] = self.MAX_ERROR_NUM
                                    self.db.adds_https_pool(
                                        ((*item[0][:4], 0, item[0][6]), ))
                                else:
                                    item[0][5] += 1
                        else:
                            if item[2] == 'http':
                                item[0][4] += 1
                            elif item[2] == 'https':
                                item[0][5] += 1

                        if item[0][4] < self.MAX_ERROR_NUM or item[0][
                                5] < self.MAX_ERROR_NUM:
                            self.db.adds_temp_buffer((item[0], ))

                        check_buffer.remove(item)
                time.sleep(0.01)

    def interface(self):
        while True:
            instr = input('>>>')
            if instr == 'exit':
                sys.exit()
            elif instr == 'get_http_one':
                print(self.db.get_http_one())
            elif instr == 'get_https_one':
                print(self.db.get_https_one())
            else:
                print('Unknown command, please try again')
Example #17
#!/usr/bin/env python
# coding=utf-8

from sanic import Sanic
from sanic.response import json, html
import os
import sys

base_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, base_dir)

from database import RedisClient

app = Sanic()
redis_conn = RedisClient()


@app.route("/")
async def index(request):
    return html('<h2>Welcome to Proxy Pool System</h2>')


@app.route("/pop")
async def pop_proxy(request):
    proxy = await redis_conn.pop_proxy()
    if not proxy:
        # empty pool: return an empty object instead of slicing None
        return json({})
    proxy = proxy.decode('utf-8')
    if proxy.startswith("https"):
        return json({"https": proxy})
    return json({"http": proxy})
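# The listing does not show how the Sanic app is started; a minimal sketch with
# placeholder host/port values:
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8000)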
def __init__(self):
    self.redis = RedisClient()
Example #20
class DHTServer:
    def __init__(self, bind_ip, bind_port, process_id):
        self.bind_ip = bind_ip
        self.bind_port = bind_port
        self.process_id = process_id
        self.nid = get_rand_id()
        # nodes is a double-ended queue of discovered DHT nodes
        self.nodes = deque(maxlen=MAX_NODE_QSIZE)
        # KRPC is a simple RPC protocol; messages are bencode-encoded and sent as UDP datagrams
        self.udp = socket.socket(socket.AF_INET, socket.SOCK_DGRAM,
                                 socket.IPPROTO_UDP)
        # bind the UDP socket
        self.udp.bind((self.bind_ip, self.bind_port))
        # Redis client
        self.rc = RedisClient()
        self.logger = get_logger("logger_{}".format(bind_port))

    def bootstrap(self):
        """
        利用 tracker 服务器,伪装成 DHT 节点,加入 DHT 网络
        """
        for address in BOOTSTRAP_NODES:
            self.send_find_node(address)

    def bs_timer(self):
        """
        定时执行 bootstrap()
        """
        t = 1
        while True:
            if t % PER_SEC_BS_TIMER == 0:
                t = 1
                self.bootstrap()
            t += 1
            time.sleep(1)

    def send_krpc(self, msg, address):
        """
        发送 krpc 协议

        :param msg: 发送 UDP 报文信息
        :param address: 发送地址,(ip, port) 元组
        """
        try:
            # msg 要经过 bencode 编码
            self.udp.sendto(bencoder.bencode(msg), address)
        except:
            pass

    def send_error(self, tid, address):
        """
        发送错误回复
        """
        msg = dict(t=tid, y="e", e=[202, "Server Error"])
        self.send_krpc(msg, address)

    def send_find_node(self, address, nid=None):
        """
        发送 find_node 请求。

        `find_node 请求`
        find_node 被用来查找给定 ID 的节点的联系信息。这时 KPRC 协议中的
        "q" == "find_node"。find_node 请求包含 2 个参数,第一个参数是 id,
        包含了请求节点的 ID。第二个参数是 target,包含了请求者正在查找的
        节点的 ID。当一个节点接收到了 find_node 的请求,他应该给出对应的
        回复,回复中包含 2 个关键字 id 和 nodes,nodes 是字符串类型,
        包含了被请求节点的路由表中最接近目标节点的 K(8) 个最接近的节点的联系信息。

        `示例`
        参数: {"id" : "<querying nodes id>", "target" : "<id of target node>"}
        回复: {"id" : "<queried nodes id>", "nodes" : "<compact node info>"}

        :param address: 地址元组(ip, port)
        :param nid: 节点 id
        """
        nid = get_neighbor(nid) if nid else self.nid
        tid = get_rand_id()
        msg = dict(
            t=tid,
            y="q",
            q="find_node",  # 指定请求为 find_node
            a=dict(id=nid, target=get_rand_id()),
        )
        self.send_krpc(msg, address)

    def send_find_node_forever(self):
        """
        循环发送 find_node 请求
        """
        self.logger.info("send find node forever...")
        while True:
            try:
                # 弹出一个节点
                node = self.nodes.popleft()
                self.send_find_node((node.ip, node.port), node.nid)
                time.sleep(SLEEP_TIME)
            except IndexError:
                # 一旦节点队列为空,则重新加入 DHT 网络
                self.bootstrap()

    def save_magnet(self, info_hash):
        """
        将磁力链接保存到数据库

        :param info_hash:  磁力链接的 info_hash
        """
        # 使用 codecs 解码 info_hash
        hex_info_hash = codecs.getencoder("hex")(info_hash)[0].decode()
        magnet = MAGNET_PER.format(hex_info_hash)
        self.rc.add_magnet(magnet)
        # self.logger.info("pid " + str(self.process_id) + " - " + magnet)
        self.logger.info("pid_{0} - {1}".format(self.process_id, magnet))

    def on_message(self, msg, address):
        """
        负责返回信息的处理

        :param msg: 报文信息
        :param address: 报文地址
        """
        try:
            # `回复`
            # 对应于 KPRC 消息字典中的 y 关键字的值是 r,包含了一个附加的关键字 r。
            # 关键字 r 是字典类型,包含了返回的值。发送回复消息是在正确解析了请求消息的
            # 基础上完成的。
            if msg[b"y"] == b"r":
                # nodes 是字符串类型,包含了被请求节点的路由表中最接近目标节点
                # 的 K个最接近的节点的联系信息。
                if msg[b"r"].get(b"nodes", None):
                    self.on_find_node_response(msg)
            # `请求`
            # 对应于 KPRC 消息字典中的 y 关键字的值是 q,它包含 2 个附加的关键字
            # q 和 a。关键字 q 是字符串类型,包含了请求的方法名字。关键字 a 一个字典
            # 类型包含了请求所附加的参数。
            # 而实际上我们只需要获取这两者中的 info hash,用于构造磁力链接进而获取种子。
            elif msg[b"y"] == b"q":
                # get_peers 与 torrent 文件的 info_hash 有关。这时 KPRC 协议中的
                # "q" = "get_peers"。get_peers 请求包含 2 个参数。第一个参数是 id,
                # 包含了请求节点的 ID。第二个参数是 info_hash,它代表 torrent 文件的 info_hash
                if msg[b"q"] == b"get_peers":
                    self.on_get_peers_request(msg, address)
                # announce_peer 表明请求的节点正在某个端口下载 torrent
                # 文件。announce_peer 包含 4 个参数。第一个参数是 id,包含了请求节点的 ID;
                # 第二个参数是 info_hash,包含了 torrent 文件的 info_hash;第三个参数是 port
                # 包含了整型的端口号,表明 peer 在哪个端口下载;第四个参数数是 token,
                # 这是在之前的 get_peers 请求中收到的回复中包含的。
                elif msg[b"q"] == b"announce_peer":
                    self.on_announce_peer_request(msg, address)
        except KeyError:
            pass

    def on_find_node_response(self, msg):
        """
        解码 nodes 节点信息,并存储在双端队列

        :param msg: 节点报文信息
        """
        nodes = get_nodes_info(msg[b"r"][b"nodes"])
        for node in nodes:
            nid, ip, port = node
            # 进行节点有效性判断
            if len(nid) != PER_NID_LEN or ip == self.bind_ip:
                continue
            # 将节点加入双端队列
            self.nodes.append(HNode(nid, ip, port))

    def on_get_peers_request(self, msg, address):
        """
        处理 get_peers 请求,获取 info hash

        :param msg: 节点报文信息
        :param address: 节点地址
        """
        tid = msg[b"t"]
        try:
            info_hash = msg[b"a"][b"info_hash"]
            self.save_magnet(info_hash)
        except KeyError:
            # 没有对应的 info hash,发送错误回复
            self.send_error(tid, address)

    def on_announce_peer_request(self, msg, address):
        """
        处理 get_announce 请求,获取 info hash,address, port
        本爬虫目的暂时只是爬取磁链,所以忽略 address, port 有需要的
        开发者可自行完善这部分内容

        :param msg: 节点报文信息
        :param address: 节点地址
        """
        tid = msg[b"t"]
        try:
            info_hash = msg[b"a"][b"info_hash"]
            self.save_magnet(info_hash)
        except KeyError:
            # 没有对应的 info hash,发送错误回复
            self.send_error(tid, address)

    def receive_response_forever(self):
        """
        循环接受 udp 数据
        """
        self.logger.info("receive response forever {}:{}".format(
            self.bind_ip, self.bind_port))
        # 首先加入到 DHT 网络
        self.bootstrap()
        while True:
            try:
                # 接受返回报文
                data, address = self.udp.recvfrom(UDP_RECV_BUFFSIZE)
                # 使用 bdecode 解码返回数据
                msg = bencoder.bdecode(data)
                # 处理返回信息
                self.on_message(msg, address)
                time.sleep(SLEEP_TIME)
            except Exception as e:
                self.logger.warning(e)
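# A hedged sketch of how DHTServer might be started; the bind address, port and
# process id are placeholders, and the threading import is assumed since the
# module's imports are not shown here:
import threading

def start_dht_server(bind_ip="0.0.0.0", bind_port=6881, process_id=0):
    server = DHTServer(bind_ip, bind_port, process_id)
    # periodically rejoin the network and keep asking peers for more nodes
    threading.Thread(target=server.bs_timer, daemon=True).start()
    threading.Thread(target=server.send_find_node_forever, daemon=True).start()
    # receive and dispatch incoming KRPC messages in the foreground
    server.receive_response_forever()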