Beispiel #1
0
def fetchFreeProxy(num=500):
    """Fetch up to *num* free proxies from 89ip.cn, merge them with the
    previously known-good proxies, and validate all of them asynchronously
    via twisted.

    Side effects: resets the module-global ``availables`` list (the
    validation callbacks are expected to repopulate it) and runs -- then
    stops -- the twisted reactor, so this call blocks until every proxy
    check has finished.
    """
    global availables
    import sys

    # Force a fresh reactor: twisted installs a single global reactor on
    # first import, so drop any cached module before re-importing.
    # pop() instead of del so the first call (nothing cached yet) does
    # not raise KeyError.
    sys.modules.pop('twisted.internet.reactor', None)
    from twisted.internet import reactor
    from twisted.internet import default

    default.install()

    res = requests.get(
        "http://www.89ip.cn/tqdl.html?api=1&num=%d&port=&address=&isp=" %
        (num))

    # Use res.text (str), not res.content (bytes): a str regex pattern
    # cannot be applied to bytes in Python 3.  Raw string avoids the
    # invalid "\." escape-sequence warning.
    proxies = ['http://' + proxy
               for proxy in re.findall(r"([0-9\.:]{10,})", res.text)]

    # Merge previously available proxies into the candidate list, then
    # clear the global list before re-validation.
    proxies.extend(item['proxy'] for item in availables)
    availables = []

    # Deduplicate before checking.
    proxies = list(set(proxies))

    # Any small, reliable URL works as a liveness probe.
    url = "https://www.baidu.com/img/bd_logo1.png?where=super"

    deferred_list = []
    for proxy in proxies:
        deferred = checkStatus(url=url, proxy=proxy, timeout=10)
        deferred.addCallback(callback)   # invoked when the request returns
        deferred.addErrback(errback)
        deferred_list.append(deferred)   # collect so completion can be awaited

    # Stop the reactor once every check has finished (success or failure).
    dlist = defer.DeferredList(deferred_list)
    dlist.addBoth(lambda _: reactor.stop())
    reactor.run()
Beispiel #2
0
from twisted.web.client import getPage, defer
from twisted.internet import reactor


def all_done(arg):
    """Completion callback: break out of the event loop by stopping
    the reactor (the *arg* result is ignored)."""
    reactor.stop()


def callback(contents):
    """Print the fetched page body, decoded from UTF-8 bytes."""
    text = contents.decode('utf-8')
    print(text)


# One asynchronous request per URL; gather the deferreds so the whole
# batch can be awaited at once.
url_list = [
    'http://www.bing.com',
    'http://www.baidu.com',
]

deferred_list = []
for url in url_list:
    d = getPage(bytes(url, encoding='utf8'))  # fire the request
    d.addCallback(callback)                   # print the body on arrival
    deferred_list.append(d)                   # collect for completion check

# DeferredList fires after every request has finished; then stop the loop.
defer.DeferredList(deferred_list).addBoth(all_done)

reactor.run()  # event loop blocks here until all_done stops it
 def start(self):
     """Run the reactor until every active crawl has terminated,
     then let _stop_reactor shut the loop down."""
     defer.DeferredList(self._active).addBoth(self._stop_reactor)
     reactor.run()
Beispiel #4
0
 def stop(self):
     """Ask every crawler to stop; return a DeferredList that fires
     once all of them have finished shutting down."""
     pending = [crawler.stop() for crawler in list(self.crawlers)]
     return defer.DeferredList(pending)
Beispiel #5
0
 def start(self):
     """Start the event loop; it runs until every active crawl is done."""
     done = defer.DeferredList(self._active)
     done.addBoth(self._stop_reactor)  # stop once all spiders finish
     reactor.run()
Beispiel #6
0
def _stop_reactor(_=None):
    """Shut down the twisted event loop; the result argument is ignored."""
    reactor.stop()


# Module-level request counter shared by every callback invocation.
count = 1
def callback(response):
    # NOTE: the `yield` below makes this a *generator* function -- calling
    # it only builds a generator; nothing here (including the counter
    # increment and the print) runs until that generator is iterated by
    # the crawl engine.
    global count
    count += 1
    print(len(response))
    # After the counter passes 3, `return` ends the generator early, so
    # no further requests are scheduled.
    if count > 3:
        return None
    # Otherwise yield ten follow-up requests, each reusing this callback,
    # so crawling continues until the counter cap is reached.
    for i in range(10):
        yield Request("http://dig.chouti.com/all/hot/recent/%s" % i, callback)


if __name__ == '__main__':

    # Seed the engine with one spider: a single starting request whose
    # callback generates the follow-up requests.
    spider_list = [
        [Request("http://www.baidu.com", callback), ]
    ]

    # One crawl deferred per spider, collected so we know when all finish.
    _active = set()
    for request_batch in spider_list:
        _active.add(crawl(request_batch))

    # Once every crawl has completed, stop the reactor.
    defer.DeferredList(_active).addBoth(_stop_reactor)

    reactor.run()
Beispiel #7
0
# twisted is a networking framework; one of its features is sending
# asynchronous requests, detecting I/O readiness and switching automatically.

# Basic twisted usage.

from twisted.web.client import getPage, defer
# BUG FIX: the original `from twisted.internet.reactor import sto` was a
# typo ("sto" does not exist) and also left the name `reactor` -- used in
# all_done() and at the bottom -- unbound, so the script crashed.
from twisted.internet import reactor


def all_done(arg):
    """Fired once every request has completed: stop the event loop."""
    reactor.stop()


def callback(res):
    """Print one response body; the return value is passed to any
    subsequently added callback in the chain."""
    print(res)
    return 1


defer_list = []
urls = [
    'https://www.baidu.com',
    'https://www.sina.com.cn',
    'https://www.python.org',
]

for url in urls:
    # BUG FIX: 'utf=-8' is not a valid codec name and raised LookupError;
    # getPage needs the URL as UTF-8 bytes.
    obj = getPage(url.encode('utf-8'))
    obj.addCallback(callback)
    defer_list.append(obj)

# Stop the loop once the whole batch has finished (success or failure).
defer.DeferredList(defer_list).addBoth(all_done)
reactor.run()
Beispiel #8
0
from twisted.web.client import getPage, defer
from twisted.internet import reactor


def stop_loop(arg):
    """Terminate the reactor's event loop; *arg* (the batch result)
    is ignored."""
    reactor.stop()


def get_response(contents):
    """Print the raw response body handed over by the deferred."""
    print(contents)


# Fire one request per URL and keep the deferreds so completion of the
# whole batch can be detected.
url_list = [
    'http://www.baidu.com/',
    'https://www.cnblogs.com/',
    'https://www.cnblogs.com/news/',
    'https://www.cn.bing.com/',
    'https://stackoverflow.com/',
]

deferred_list = []
for url in url_list:
    d = getPage(bytes(url, encoding='utf8'))
    d.addCallback(get_response)  # print each body as it arrives
    deferred_list.append(d)

# Fires once every request has finished, then stops the event loop.
dlist = defer.DeferredList(deferred_list)
dlist.addBoth(stop_loop)

reactor.run()