Example no. 1
"""
This code uses select-based I/O multiplexing to perform HTTP requests on a single thread.

* Event-loop use cases:
    tornado
    twisted  (scrapy, django channels)
    gevent
    asyncio
"""
import os
import socket
from urllib.parse import urlparse
# selectors is a higher-level wrapper around the select module
from selectors import DefaultSelector, EVENT_READ, EVENT_WRITE

# import select
# select.select()

selector = DefaultSelector()  # picks the best multiplexer for the platform: select on Windows, epoll on Linux
urls = []
stop = False


class Fetcher:
    def connected(self, key):
        selector.unregister(key.fd)  # fd == self.client.fileno()
        self.client.send(
            "GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n".format(
                self.path, self.host).encode("utf8"))
        # Register an event: wait until this socket becomes readable (EVENT_READ),
        # then invoke self.readable (which recv's the response data)
        selector.register(self.client.fileno(), EVENT_READ, self.readable)

    def readable(self, key):
        # body reconstructed from the identical Fetcher in Example no. 11 below
        d = self.client.recv(1024)
        if d:
            self.data += d
        else:
            selector.unregister(key.fd)
            print(self.data.decode("utf8"))
class Crawler:
    select = DefaultSelector()
    finished = False

    def __init__(self, url):
        self.client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.client.setblocking(False)
        self.url = url
        self.response = b''
        global urls

    # getUrl is a delegating generator: it opens three yield from channels, one
    # per subgenerator, for communication between the caller and the three
    # subgenerators. The channels are not open simultaneously: each one is
    # established only after the previous one finishes, so at any moment exactly
    # one channel exists and the caller talks to exactly one subgenerator
    # (runs exactly one coroutine at a time)
    def getUrl(self):
        self.__parseUrl()

        # suspend while connecting
        yield from self.__connect()

        # once connected, suspend while sending the request
        yield from self.__sendReq()

        # after sending, suspend while receiving the response
        yield from self.__recvResponse()

    def __connect(self):
        try:
            self.client.connect((self.host, 80))
        except BlockingIOError:
            # a non-blocking connect raises before the handshake completes
            pass

        f = Future()

        self.select.register(self.client, EVENT_WRITE, data=f)

        yield f

    def __sendReq(self):
        self.select.unregister(self.client)

        self.client.send(self.send_msg.encode('utf-8'))

        f = Future()

        self.select.register(self.client, EVENT_READ, data=f)

        yield f

    def __recvResponse(self):
        while True:
            self.select.unregister(self.client)
            data = self.client.recv(1024)

            if data:
                self.response += data
                f = Future()
                self.select.register(self.client, EVENT_READ, data=f)
                yield f
            else:
                self.client.close()

                urls.remove(self.url)

                self.__class__.finished = not urls

                self.__saveHtml()

                break

    def __parseUrl(self):
        url_component = urlparse(self.url)
        self.host = url_component.netloc
        self.path = '/' if url_component.path == '' else url_component.path
        self.send_msg = "GET %s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n\r\n" % (
            self.path, self.host)

    def __saveHtml(self):
        try:
            dir_path = './crawled_page2/'
            # str.strip() removes a character set, not a suffix, so trim
            # '.html' explicitly rather than calling strip('.html')
            base = self.path.strip('/')
            if base.endswith('.html'):
                base = base[:-len('.html')]
            fname = 'index.html' if self.path == '/' else base + '.html'
            content_arr = self.response.decode('utf-8').split("\r\n\r\n")
            content_arr[0] = ''
            content = ''.join(content_arr)

            if not os.path.isdir(dir_path):
                os.mkdir(dir_path)

            with open(dir_path + fname, 'w', encoding='utf-8') as f:
                f.write(content)
            print("%s 爬取成功" % str(fname))
        except BaseException as e:
            print(e)

        finally:
            # del self only removes this local reference; the instance is
            # actually freed once no other references remain
            del self

    @classmethod
    def loopEvents(cls):
        while not cls.finished:
            events = cls.select.select()
            for key, mask in events:
                f = key.data
                f.runCallback()
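
The excerpt above yields Future objects and loopEvents() calls f.runCallback(), but the listing omits the Future and Task definitions. Below is a minimal sketch consistent with those calls; the setCallback name and the exact Task shape are assumptions inferred from the surrounding code.

class Future:
    def __init__(self):
        self.callback = None

    def setCallback(self, callback):
        self.callback = callback

    def runCallback(self):
        # invoked by loopEvents() once the registered event fires
        self.callback()


class Task:
    def __init__(self, coro):
        self.coro = coro
        self.step()

    def step(self):
        try:
            f = next(self.coro)  # run the coroutine until it yields a Future
        except StopIteration:
            return
        f.setCallback(self.step)  # resume the coroutine when the event fires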
Example no. 4
def iter_lines(proc, retcode = 0, timeout = None, linesize = -1):
    """Runs the given process (equivalent to run_proc()) and yields a tuples of (out, err) line pairs.
    If the exit code of the process does not match the expected one, :class:`ProcessExecutionError
    <plumbum.commands.ProcessExecutionError>` is raised.

    :param retcode: The expected return code of this process (defaults to 0).
                    In order to disable exit-code validation, pass ``None``. It may also
                    be a tuple (or any iterable) of expected exit codes.

    :param timeout: The maximal amount of time (in seconds) to allow the process to run.
                    ``None`` means no timeout is imposed; otherwise, if the process hasn't
                    terminated after that many seconds, the process will be forcefully
                    terminated and an exception will be raised.

    :param linesize: Maximum number of characters to read from stdout/stderr at each iteration.
                    ``-1`` (default) reads until a b'\\n' is encountered.

    :returns: An iterator of (out, err) line tuples.
    """

    encoding = getattr(proc, "encoding", None)
    if encoding:
        read_stream = lambda s: s.readline(linesize).decode(encoding).rstrip()
    else:
        read_stream = lambda s: s.readline(linesize)

    _register_proc_timeout(proc, timeout)

    try:
        from selectors import DefaultSelector, EVENT_READ
    except ImportError:
        # Pre Python 3.4 implementation
        def _iter_lines():
            from select import select
            while True:
                rlist, _, _ = select([proc.stdout, proc.stderr], [], [])
                for stream in rlist:
                    yield (stream is proc.stderr), read_stream(stream)
                if proc.poll() is not None:
                    break
    else:
        # Python 3.4 implementation
        sel = DefaultSelector()

        sel.register(proc.stdout, EVENT_READ, 0)
        sel.register(proc.stderr, EVENT_READ, 1)
        def _iter_lines():
            while True:
                for key, mask in sel.select():
                    yield key.data, read_stream(key.fileobj)
                if proc.poll() is not None:
                    break

    from io import StringIO  # local import so the excerpt stands alone, matching the function's local-import style
    buffers = [StringIO(), StringIO()]
    for t, line in _iter_lines():
        ret = [None, None]
        ret[t] = line
        buffers[t].write(line + "\n")
        yield ret

    # this will take care of checking return code and timeouts
    _check_process(proc, retcode, timeout, *(s.getvalue() for s in buffers))
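
A usage sketch for the helper above, assuming the plumbum module context it was lifted from (local["..."] and .popen() are standard plumbum API; whether a bare popen() object satisfies the omitted _register_proc_timeout/_check_process helpers is an assumption). retcode=None disables exit-code validation, as the docstring describes.

from plumbum import local

proc = local['ping']['-c', '3', '127.0.0.1'].popen()
for out, err in iter_lines(proc, retcode=None):
    if out is not None:
        print('stdout:', out)
    if err is not None:
        print('stderr:', err)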
Example no. 5
 def __init__(self):
     self._tasks = deque()
     self._sleeping = []
     self._stop = False
     self._selector = DefaultSelector()
Example no. 6
# coding=utf-8

import socket
from selectors import DefaultSelector, EVENT_READ, EVENT_WRITE

epoller = DefaultSelector()
stopped = False
urls_todo = {'/'+str(i) for i in range(10)}

# Future object: result holds the eventual outcome of an asynchronous call
class Future:
    # start with no result and no callbacks
    def __init__(self):
        self.result = None
        self._callbacks = []

    # register a callback to run once the result is set
    def add_done_callback(self,fn):
        self._callbacks.append(fn)

    # set the result and fire the registered callbacks
    def set_result(self,result):
        self.result = result
        for fn in self._callbacks:
            fn(self)

# crawler class
class Crawler:
    def __init__(self,url):
        self.url = url
        self.sock = None
Example no. 7
# @Email   : [email protected]
# @File    : 1.py
# @Software: PyCharm

'''
The bidirectional channel of yield from: inside gen(), the yield from keyword
opens a communication channel between subgen() and main(). main() can send
data 1 directly to subgen(), subgen() can return computed data 2 back to
main(), and main() can also throw an exception straight into subgen() to
terminate it.

Incidentally, besides yield from <generator>, you can also write
yield from <iterable>.
'''

import socket
from selectors import DefaultSelector, EVENT_WRITE, EVENT_READ

selector = DefaultSelector()
stopped = False
urls_todo = {'/', '/1', '/2', '/3', '/4', '/5', '/6', '/7', '/8', '/9'}


class Future:
    def __init__(self):
        self.result = None
        self._callbacks = []

    def add_done_callback(self, fn):
        self._callbacks.append(fn)

    def set_result(self, result):
        self.result = result
        for fn in self._callbacks:
            fn(self)
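
A self-contained sketch of the two-way channel the docstring above describes: the caller talks straight to subgen() through the yield from inside gen().

def subgen():
    received = yield 'ready'   # data 1 sent from the caller lands here
    return received * 2        # data 2 travels back as the yield from value

def gen():
    result = yield from subgen()  # transparent bidirectional channel
    yield result

g = gen()
print(next(g))     # 'ready' -- yielded by subgen(), passed through gen()
print(g.send(21))  # 42 -- subgen() returned 42, gen() yields it onward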
Example no. 8
import socket
from typing import Callable, Generator
from selectors import DefaultSelector, EVENT_WRITE, EVENT_READ

selector = DefaultSelector()  # create the selector object

stopped = False


class Future:
    def __init__(self):
        self.result = None
        self._callbacks = []

    def add_done_callback(self, fn: Callable) -> None:
        self._callbacks.append(fn)

    def set_result(self, result) -> None:
        self.result = result
        for fn in self._callbacks:
            fn(self)

    # making Future iterable lets a coroutine suspend on it via `yield from`
    def __iter__(self):
        # hand this future to the Task; the coroutine resumes from here
        yield self
        return self.result


class Task:
    def __init__(self, coro):
        # assumed continuation of the truncated excerpt: the classic step()
        # driver that pairs with the iterable Future above
        self.coro = coro
        f = Future()
        f.set_result(None)
        self.step(f)

    def step(self, future):
        try:
            # feed the finished future's result in; get the next Future out
            next_future = self.coro.send(future.result)
        except StopIteration:
            return
        next_future.add_done_callback(self.step)
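
A tiny driver sketch showing the mechanics; the set_result call below stands in for the selector callback that would normally fire it.

f = Future()

def coro():
    value = yield from f  # Future.__iter__ suspends the coroutine here
    print('got:', value)

Task(coro())           # start the coroutine; it parks on f
f.set_result('hello')  # resumes it through the stored step() callback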
Example no. 9
    def __init__(self):
        self.co_result = {}

        self.selector = DefaultSelector()
Example no. 10
class Crawler:
    # a selector object kept as a class attribute; this was developed on
    # Windows, so DefaultSelector resolves to the select-based multiplexer
    select = DefaultSelector()
    # whether the crawl is finished; used to stop the event loop: once all
    # urls are fetched, the loopEvents() loop stops listening for events
    finished = False

    # urls is the module-level list of urls to crawl
    def __init__(self, url):
        self.client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.client.setblocking(False)  # put the socket in non-blocking mode
        self.url = url
        self.response = b''  # accumulates all data received for this request
        global urls

    # crawl a single page; this is a coroutine method
    def getUrl(self):
        # parse the url and build the request message
        self.__parseUrl()

        # connecting a non-blocking socket raises before the handshake
        # completes, so the try is mandatory here
        try:
            self.client.connect((self.host, 80))
        except BlockingIOError:
            pass

        # the connect is an event that has not completed yet, so create a Future
        f = Future()

        # register interest in the write event: for a client, the socket being
        # writable means the connection is established and the request can be sent
        # self.select.register(self.client, EVENT_WRITE, data=self.__sendReq)  # callback-style registration; the coroutine-style version follows
        self.select.register(
            self.client, EVENT_WRITE,
            data=f)  # the event's payload is the f object itself; function references are objects, and so is f, so this works

        # suspend the coroutine; it resumes only when the event is ready and the
        # Task's step method stored on the f object is called.
        # register must come before yield f: suspending first means the event's
        # callback can never fire, because an unregistered event is never
        # monitored, so its step callback is never invoked and the coroutine
        # stays suspended forever
        yield f  # hand the Future to the Task; the Task stores a step callback on f

        # gotcha: unregistering the write event must happen before client.send,
        # otherwise the while loop below errors out claiming the socket is
        # closed (the write event fires again, the coroutine resumes and tries
        # to recv although the read event is not actually ready, recv returns
        # empty bytes, and client.close() runs); this took a while to debug
        self.select.unregister(self.client)

        # reaching this line means the coroutine was resumed: the write event
        # is ready and the request can be sent to the server
        self.client.send(self.send_msg.encode('utf-8'))  # send the request message

        # sending the request leaves another not-yet-finished event (the read
        # event is not ready), so a fresh Future is needed: one Future per event
        # f = Future()    # folded into the while loop below, so this redundant line stays commented out

        # after sending the request we receive the response, but only once the
        # read event is ready (i.e. the server has replied and the client's
        # kernel buffer has data to read); we could watch for that by changing
        # the registration from write to read with modify:
        # self.select.modify(self.client, EVENT_READ, data=f)

        while True:  # the response may exceed one recv, so loop; before each recv, yield to suspend until the read event is ready
            f = Future()
            self.select.register(
                self.client, EVENT_READ, data=f
            )  # why re-register each iteration? because each iteration must hand register a fresh Future
            yield f  # suspend the coroutine

            # the coroutine was resumed, so the client is readable: unregister
            # the previous read event and start receiving
            self.select.unregister(self.client)
            data = self.client.recv(1024)
            if data:
                self.response += data
            else:  # all data received
                self.client.close()  # close the connection

                # this url's content is fetched, so drop it from urls
                urls.remove(self.url)

                self.__class__.finished = not urls

                self.__saveHtml()

                break  # leave the loop, ending the coroutine

    # parse the host and crawl path from the url, and build the request message
    def __parseUrl(self):
        url_component = urlparse(self.url)
        self.host = url_component.netloc  # the url's host name
        self.path = '/' if url_component.path == '' else url_component.path
        self.send_msg = "GET %s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n\r\n" % (
            self.path, self.host)

    # save the page to a file
    def __saveHtml(self):
        try:
            dir_path = './crawled_page/'
            # str.strip() removes a character set, not a suffix, so trim
            # '.html' explicitly rather than calling strip('.html')
            base = self.path.strip('/')
            if base.endswith('.html'):
                base = base[:-len('.html')]
            fname = 'index.html' if self.path == '/' else base + '.html'
            content_arr = self.response.decode('utf-8').split(
                "\r\n\r\n")  # the first element is the response headers; drop it and keep only the body
            content_arr[0] = ''
            content = ''.join(content_arr)

            if not os.path.isdir(dir_path):
                os.mkdir(dir_path)

            with open(dir_path + fname, 'w', encoding='utf-8') as f:
                f.write(content)
            print("%s crawled successfully" % fname)
        except BaseException as e:
            print(e)

        finally:  # release this Crawler so one instance per url does not pile up in memory; note del self only removes the local reference, the instance is freed once nothing else refers to it
            del self

    # loop listening for events (blocking): the multiplexer watches the event
    # state of every client socket, and the callbacks of ready events are
    # invoked in here; the step callback resumes the coroutine
    @classmethod
    def loopEvents(cls):
        while not cls.finished:
            events = cls.select.select()  # block until sockets have ready events; returns a list of tuples
            for key, mask in events:  # key is a SelectorKey whose data attribute carries the callback payload (here a Future); mask is the event type, an int
                f = key.data  # the Future passed to register() for this event; one registered event maps to one Future
                f.runCallback()  # the callback resumes the coroutine
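
The listing also omits how the crawl is started. A minimal launch sketch, assuming the Future/Task pair sketched after Example no. 1 and a module-level urls list:

if __name__ == '__main__':
    urls = ['http://www.baidu.com/s?wd=%d' % i for i in range(5)]
    for url in list(urls):  # iterate a copy; getUrl removes entries from urls
        Task(Crawler(url).getUrl())
    Crawler.loopEvents()  # blocks here until every url has been crawled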
Example no. 11
import socket
from selectors import DefaultSelector, EVENT_READ, EVENT_WRITE

selector = DefaultSelector()


class Fetcher:
    def connected(self, key):
        selector.unregister(key.fd)
        self.client.send("GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n".format("/",self.host).encode("utf8"))
        selector.register(self.client.fileno(), EVENT_READ, self.readable)

    def readable(self, key):
        d = self.client.recv(1024)
        if d:
            self.data += d
        else:
            selector.unregister(key.fd)
            data = self.data.decode("utf8")
            print(data)

    def get_url(self, url):
        self.client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.client.setblocking(False)
        self.data = b""
        self.host = "www.baidu.com"

        try:
            self.client.connect((self.host, 80))
        except BlockingIOError:
            pass

        # assumed continuation of the truncated excerpt: watch for writability;
        # once connected, self.connected sends the request
        selector.register(self.client.fileno(), EVENT_WRITE, self.connected)
Example no. 12
def distribute(cmd, max_bytes, max_procs, chunk_size, round_robin, verbose):
    """
    Blocking function that manages all the delegation of chunks from
    stdin to subprocesses, spawning of subprocess, and collation of
    stdout and stderr from each subprocess.

    Broadly speaking, gatling reads chunks of data from stdin and
    disperses those among subprocesses to achieve parallel execution.

    Stdin is read in chunks of chunk_size bytes and truncated to the
    last newline, keeping the remainder to be prepended on the next
    chunk. Multiple chunks without newline is allowed, nothing is
    passed on until a newline is found or stdin closes.

    The stdout and stderr from each child is read in a similar
    fashion. Whenever a newline is found the output is written onto
    stdout or stderr, respectively. This collation preserves the
    format of the output, but only weakly adheres to the chronology.
    The output from gatling is the result of the subprocesses with a
    line-by-line integrity preserved with no guarantee that the order
    is exactly maintained.

    There are two different behaviors for subprocess spawning,
    manifold or gatling:

    * In manifold-mode each new chunk read from stdin spawns a new
      subprocess until max_procs is reached and then each of the
      subprocesses are fed a chunk in round robin fashion. This is an
      excellent model for programs that do not tax an external
      resource and programs that can act on stdin as soon as it is
      available.

    * In gatling-mode chunks are fed to a single subprocess until that
      processes' max_bytes is reached, the subprocess' stdin is closed
      and on the next chunk a new subprocess is spawned and fed until
      its max_bytes is reached and so on until max_procs is reached.
      If max_procs is reached gatling is blocked until a subprocess
      finishes. This mode works well for programs that connect to an
      external service or programs that don't start processing stdin
      until it is closed.

    cmd (list): The full command line for spawning subprocesses.

    max_bytes (int): Maximum bytes to pass to each subprocess before
        closing the subprocess' stdin. Increase this value if
        subprocesses do not have memory management problems from large
        input and if subprocesses process stdin as stream. Decrease
        this value if programs can not handle a large input set on
        stdin or if programs do not start processing until stdin is
        closed.

    max_procs (int): Maximum number of simultaneous subprocesses.
        Increase this number if the subprocess is largely CPU bound,
        decrease it to match the hardware if the subprocesses are IO
        bound.

    chunk_size (int): Stdin content streamed to gatling is consumed in
        chunks of this size (bytes) for efficiency reasons. Originally
        it was line by line, but that was too slow to keep
        subprocesses fed continuously. To force line-by-line behavior,
        set chunk to a size always less than the length of an input
        line (worst case 1, but try to keep it as high as possible).
        Experiment with this number to maximize pipeline throughput.

    round_robin (bool): True is manifold, false is gatling, see above
        for details.
    """

    import sys  # sys.stderr and sys.platform are referenced below
    from sys import stdin, stdout, stderr
    from subprocess import Popen, PIPE
    from time import sleep, time
    from selectors import DefaultSelector, EVENT_READ
    from threading import Thread
    from fcntl import fcntl, F_SETFL, F_GETFL
    from os import O_NONBLOCK

    def sink(selector, not_done):
        n = 0
        while len(not_done) > 1 or list(selector.get_map()):
            for (fileobj, _, _, p), _ in selector.select():
                chunk = fileobj.read(chunk_size)
                if chunk:
                    i = chunk.rfind(b'\n') + 1
                    if i:
                        n += fileobj._trg.write(fileobj._buf)
                        n += fileobj._trg.write(chunk[:i])
                        fileobj._buf = chunk[i:]
                    else:
                        fileobj._buf += chunk
                else:
                    if p.returncode is not None:
                        n += fileobj._trg.write(fileobj._buf)
                        selector.unregister(fileobj)
            not_done[0] = n

    my_name = 'manifold' if round_robin else 'gatling'

    selector = DefaultSelector()
    res = []
    p_filled = []  # Child processes that have had their maximum input supplied
    # (p_open) Child processes that can take more input
    # In non-round-robin mode, there will only be one such child,
    # which will continually be popped and re-appended.
    # In round-robin mode, this list is rotated each time there is
    # a buffer to be written to a child.
    p_open = []
    b_in = 0
    buf = b''
    not_done = [0, 1]
    sel_t = None
    t0 = time()
    try:
        for chunk in iter(lambda: stdin.buffer.read(chunk_size), b''):
            b_in += len(chunk)
            i = chunk.rfind(b'\n') + 1
            if i:
                p = None
                if round_robin:
                    if len(p_open) + len(p_filled) == max_procs:
                        p = p_open.pop(0)
                else:
                    if p_open:
                        p = p_open.pop(0)
                if not p:
                    if verbose:
                        running = len(p_filled) + len(p_open)
                        print(
                            f"# {my_name} STARTED A PROCESS (1 + {running} + {len(res)}):",
                            *cmd,
                            file=sys.stderr)
                    p = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
                    p._n = 0
                    p._t0 = time()
                    for fo, trg in [(p.stdout, stdout.buffer),
                                    (p.stderr, stderr.buffer)]:
                        fo._buf = b''
                        fo._trg = trg
                        fcntl(fo, F_SETFL, fcntl(fo, F_GETFL) | O_NONBLOCK)
                        selector.register(fo, EVENT_READ, p)
                    if not sel_t:
                        sel_t = Thread(target=sink, args=(selector, not_done))
                        sel_t.daemon = True
                        sel_t.start()

                p._n += p.stdin.write(buf)
                p._n += p.stdin.write(chunk[:i])
                buf = chunk[i:]
                if p._n >= max_bytes:
                    t1 = time()
                    p._t1 = t1
                    td = (t1 - t0) * 1024
                    ptd = (t1 - p._t0) * 1024
                    p.stdin.close()
                    p_filled.append(p)
                    if verbose and ptd and td:
                        running = len(p_filled) + len(p_open)
                        print(
                            f"# {my_name} PROCESS LIMIT {p._n:,}/{max_bytes:,}",
                            f"({p._n/ptd:.2f} kb/s).",
                            f"INPUT: {b_in:,} ({b_in/td:.2f} kb/s) OUTPUT: {not_done[0]:,}.",
                            f"PROCESSES: {running}/{running+len(res)}",
                            file=sys.stderr)
                    while len(p_filled) == max_procs:
                        done = [d for d in p_filled if d.poll() is not None]
                        if done and verbose:
                            print(f"# {my_name} CLOSED {len(done)} PROCESSES",
                                  file=sys.stderr)
                        for d in done:
                            if verbose and p._t0 != t1:
                                print(
                                    f"# {my_name} CLOSED PROCESS INPUT: {p._n:,} TIME:",
                                    f"{t1-p._t0:.1f}/{t1-p._t1:.1f}",
                                    f"KB/S: {p._n/(t1-p._t0)/1024:.2f}",
                                    file=sys.stderr)
                            p_filled.remove(d)
                            res.append(d.returncode)
                        if not done:
                            sleep(0.5)
                else:
                    p_open.append(p)
            else:
                buf += chunk
        for p in p_open:
            p.stdin.write(buf)
            buf = b''
            p.stdin.close()
            p_filled.append(p)
    except (KeyboardInterrupt, SystemExit):
        not_done.pop()
        for p in p_open:
            p.stdin.close()
            p.kill()
        for d in p_filled:
            d.kill()
        raise

    if sys.platform == 'linux':
        # This method, for unknown reasons, leads to busy wait and an explosion in
        # CPU consumption on the mac
        while p_filled:
            res.append(p_filled.pop(0).wait())
    else:
        # This method, for unknown reasons, works on the mac and hangs on linux
        while p_filled:
            sleep(0.5)
            tmp = [p.poll() for p in p_filled]
            for i, rt in reversed(list(enumerate(tmp))):
                if rt is not None:
                    res.append(p_filled.pop(i).wait())

    if sel_t:
        not_done.pop()
        sel_t.join()
    selector.close()
    return res
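
An invocation sketch with hypothetical parameter values: fan stdin lines out to at most four parallel grep workers in manifold mode, as described in the docstring above.

exit_codes = distribute(
    cmd=['grep', 'ERROR'],  # hypothetical worker command
    max_bytes=1 << 20,      # close a worker's stdin after ~1 MiB of input
    max_procs=4,
    chunk_size=65536,
    round_robin=True,       # manifold mode; False would be gatling mode
    verbose=False,
)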
Example no. 13
#!/usr/bin/python3 -u
from selectors import DefaultSelector, EVENT_READ
from signal import signal, SIGTERM
from socket import socket, AF_INET, SOCK_DGRAM
from sys import argv
from time import time

from constants import UDP_MTU
from util import b2i

socket_selector = DefaultSelector()

output_file = argv[1]

for port_number in argv[2:]:
    listener_socket = socket(AF_INET, SOCK_DGRAM)
    listener_socket.bind(("127.0.0.1", int(port_number)))

    socket_selector.register(listener_socket, EVENT_READ)


killed = False  # initialize before installing the handler, so an early SIGTERM is not overwritten


def set_killed(signum, stackframe):
    global killed

    killed = True


signal(SIGTERM, set_killed)
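
The excerpt stops before the receive loop. A minimal sketch of the drain loop this setup implies, assuming datagrams are appended verbatim to output_file:

with open(output_file, 'ab') as out:
    while not killed:
        # wake up periodically so a SIGTERM between datagrams is noticed
        for key, _ in socket_selector.select(timeout=1.0):
            out.write(key.fileobj.recv(UDP_MTU))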
Example no. 14
        path = parsed_url.path if parsed_url.path else '/'
        path_with_query = '{}?{}'.format(
            path, parsed_url.query) if parsed_url.query else path
        await send(
            sock,
            'GET {} HTTP/1.1\r\nHost:{}\r\nConnection:Close\r\n\r\n'.format(
                path_with_query, parsed_url.netloc).encode())
        content = await recv(sock)
        print('{}:{}'.format(url, content))


if __name__ == '__main__':
    urls = ['http://www.baidu.com/s?wd={}'.format(i) for i in range(10)]
    tasks = [fetch_url(url) for url in urls]  # task queue

    with DefaultSelector() as selector:
        while tasks or selector.get_map():
            events = selector.select(
                0 if tasks else 1)  # with pending tasks, poll already-ready I/O events immediately; otherwise wait at most 1 second
            for key, event in events:
                task = key.data
                tasks.append(task)
                selector.unregister(key.fileobj)
            for task in tasks:
                try:
                    fileobj, event = task.send(None)
                except StopIteration:
                    pass
                else:
                    selector.register(fileobj, event, task)
            tasks.clear()
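
The fetch_url/send/recv helpers are cut from the excerpt. Below is a minimal sketch of awaitables compatible with the loop above, which expects each task.send(None) to yield a (fileobj, event) pair; the _Wait class and the function bodies are assumptions.

from selectors import EVENT_READ, EVENT_WRITE

class _Wait:
    def __init__(self, sock, event):
        self.sock, self.event = sock, event

    def __await__(self):
        yield self.sock, self.event  # picked up by task.send(None) above

async def send(sock, data):
    while data:
        await _Wait(sock, EVENT_WRITE)  # wait until the socket is writable
        data = data[sock.send(data):]

async def recv(sock):
    chunks = []
    while True:
        await _Wait(sock, EVENT_READ)   # wait until the socket is readable
        chunk = sock.recv(4096)
        if not chunk:
            return b''.join(chunks)
        chunks.append(chunk)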
Example no. 15
 def __init__(self):
     self.address = ('127.0.0.1', 8080)
     self.selector = DefaultSelector()
     self.decoder = DecoderHandler()
     self.server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
Example no. 16
 def selector():
     sel = DefaultSelector()
     sel.register(proc.stdout.channel, EVENT_READ)
     while True:
         for key, mask in sel.select():
             yield
Example no. 17
# 1. epoll is not necessarily better than select:
#    with high concurrency but mostly idle connections, epoll beats select;
#    with low concurrency but very active connections, select beats epoll

# HTTP requests via non-blocking I/O:
# select + callbacks + an event loop
# -> high concurrency
# Although single-threaded, this avoids the thread-switching overhead of
# waiting on I/O: switching dozens of threads is expensive, while running
# thousands of callbacks is fast. This pattern is the core of
# high-concurrency programming.

import socket
from urllib.parse import urlparse
from selectors import DefaultSelector, EVENT_READ, EVENT_WRITE   # DefaultSelector picks select, poll, or epoll by itself depending on the platform


selector = DefaultSelector()  # module-level selector
# perform the http request with select
urls = []
stop = False


class Fetcher:
    def connected(self, key):
        selector.unregister(key.fd)  # stop monitoring this descriptor

        print("=========key.fd:", key.fd)
        print("=========self.client.fileno():", self.client.fileno())

        self.client.send("GET {} HTTP/1.1\r\nHost:{}\r\nConnection:close\r\n\r\n".format(self.path, self.host).encode("utf8"))
        selector.register(self.client.fileno(), EVENT_READ, self.readable)  # register a read watch

    # everything from here on is pure CPU work, far faster than I/O, especially network I/O
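
The excerpt cuts off before the event loop; below is the canonical loop for this callback style, matching the register calls above that store the callback method in data.

def loop():
    while not stop:
        ready = selector.select()
        for key, mask in ready:
            callback = key.data  # the method passed as register()'s data argument
            callback(key)        # connected() or readable()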
Example no. 18
import socket
from selectors import DefaultSelector, EVENT_READ, EVENT_WRITE

selector = DefaultSelector()  # wraps the underlying select/poll/epoll and picks the best one for the OS
stopped = False
urls_todo = {'/', '/1', '/2', '/3', '/4', '/5', '/6', '/7', '/8', '/9'}


class Crawler:
    def __init__(self, url):
        self.url = url
        self.sock = None
        self.response = b''

    def fetch(self):
        self.sock = socket.socket()
        self.sock.setblocking(False)
        try:
            self.sock.connect(('zh.moegirl.org', 80))
        except BlockingIOError:
            pass
        # EVENT_WRITE has the value 2
        selector.register(self.sock.fileno(), EVENT_WRITE, self.connected)

    def connected(self, key, mask):
        selector.unregister(key.fd)
        get = 'GET {0} HTTP/1.0\r\nHost: zh.moegirl.org\r\n\r\n'.format(
            self.url)
        self.sock.send(get.encode('utf-8'))
        # EVENT_READ has the value 1
        selector.register(key.fd, EVENT_READ, self.read_response)

    # assumed continuation of the truncated excerpt: accumulate the response
    # until the server closes the connection
    def read_response(self, key, mask):
        global stopped
        chunk = self.sock.recv(4096)
        if chunk:
            self.response += chunk
        else:
            selector.unregister(key.fd)
            urls_todo.remove(self.url)
            if not urls_todo:
                stopped = True
Example no. 19
#!/usr/bin/env python
import socket
import re

from selectors import DefaultSelector, EVENT_WRITE, EVENT_READ

try:
    from urlparse import urlparse
except ImportError:
    from urllib.parse import urlparse

selector = DefaultSelector()  # Linux system will use epoll()


class Task():
    def __init__(self, gen):
        self.gen = gen
        self.step()

    def step(self):
        try:
            f = next(self.gen)
        except StopIteration:
            return
        f.add_done_callback(self.step)


class Future():
    def __init__(self):
        self.callback = None

    # assumed continuation of the truncated excerpt, matching the
    # zero-argument step() used by Task above
    def add_done_callback(self, fn):
        self.callback = fn

    def set_result(self, result):
        self.result = result
        if self.callback:
            self.callback()
Example no. 20
"""
"""
### 未来对象(Future)

不使用回调的方式,如何获知异步调用的结果?可以先设计一个对象,异步调用完成后就把结果放入其中,这种对象叫做Future对象。
未来对象有一个result属性,用于存放未来的执行结果。还有一个set_result()方法,用以设置result,并且会在给定result后运行事先给
future添加的回调。回调通过add_done_callback()方法添加。
此处回调使用不同于之前回调。
"""

import socket
from selectors import DefaultSelector, EVENT_READ, EVENT_WRITE
import asyncio

selectors = DefaultSelector()
stopped = False
urls_todo = ['www.163.com']


class Future:
    def __init__(self):
        self.result = None
        self._callbacks = []

    def add_done_callback(self, fn):
        print('add call back')
        self._callbacks.append(fn)

    def set_result(self, result):
        print('set result')
        self.result = result
        for fn in self._callbacks:
            fn(self)
Example no. 21
 def __init__(self):
     self.ready = deque()
     self.selector = DefaultSelector()
Example no. 22
 def __init__(self):
     self._sel = DefaultSelector()
     self._fds = {}