コード例 #1
0
        sock.close()
        break
    # Wait for and accept a connection from a Slave
    # (blocks here until a Slave connects).
    conn, addr = sock.accept()
    # Bound the wait on recv() below so a silent client cannot hang the master.
    conn.settimeout(10)
    try:
        # Receive the Slave's request (blocks until a message arrives
        # or the 10-second timeout fires).
        req = conn.recv(1024).decode("utf-8")
        if req.startswith("get"):
            # A message starting with "get" asks the master for a URL to crawl.
            # Message format: "get,123456"
            # slave_id is the ID of the Slave that sent the message.
            slave_id = req.split(",")[1]
            task_url = task_list.get_task()
            # Send the URL back to the Slave.
            print("向'Slave {0}' 分配爬取 '{1}'".format(slave_id, task_url))
            conn.send(task_url.encode("utf-8"))
        elif req.startswith("done"):
            # A message starting with "done" tells the master a task finished.
            # Message format: "done,123456,https://finance.sina.com.cn/china/gncj/2018-10-17/doc-ifxeuwws5236619.shtml"
            # slave_id is the reporting Slave's ID; done_url is the page it
            # finished crawling.
            slave_id = req.split(",")[1]
            done_url = req.split(",")[2]
            # Remove the crawled page from the task list for good.
            print("'Slave {0}' 完成爬取 '{1}'".format(slave_id, done_url))
            task_list.done_task(done_url)
            conn.send("ok".encode("utf-8"))
    except socket.timeout:
        print("套接字连接超时")
    conn.close()
コード例 #2
0
def main(addr="0.0.0.0", port=9992,
         main_url="http://money.163.com/special/00252C1E/gjcj.html"):
    """Master node: scrape article URLs from an index page, then hand them
    out to client (Slave) crawlers over TCP until the task list is empty.

    Wire protocol (gbk-encoded, comma-separated):
        "get,<client_id>"          -> reply with one URL for the client to crawl
        "done,<client_id>,<url>"   -> mark <url> as finished, reply "ok"

    Args:
        addr: interface to bind the listening socket to.
        port: TCP port to listen on.
        main_url: index page whose article links seed the task list.
    """
    task_list = TaskList(timeout=30)
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind((addr, port))
    sock.listen(50)

    print("正在从网页中解析URL链接...")

    def gethtmltext(url, code="gbk"):
        # Fetch `url` and decode with `code`; return "" on ANY request
        # failure (connection error, timeout, non-2xx status) so the caller
        # gets a uniform "no HTML" signal instead of an exception.
        try:
            # A timeout keeps a dead server from blocking startup forever.
            r = requests.get(url, timeout=10)
            r.raise_for_status()
            r.encoding = code
            return r.text
        except requests.exceptions.RequestException:
            return ""

    html = gethtmltext(main_url)
    if html == "":
        print("---html error1!---")
    try:
        soup = BeautifulSoup(html, 'html.parser')
        news_url = []
        for item in soup.find_all('div', attrs={'class': 'list_item clearfix'}):
            try:
                h2 = item.find(name='h2')
                url = h2.find(name='a').attrs['href']
                news_url.append(url)
                print(url)
            except (AttributeError, KeyError):
                # Item without the expected <h2><a href=...> structure — skip.
                continue
        task_list.put_tasks(news_url)
    except Exception:
        # Best-effort seeding: report and continue with whatever was queued.
        print("---url error2!---")

    print("等待client中.......")
    while True:
        if task_list.is_empty():
            print("====任务完成====")
            sock.close()
            break

        # Accept a TCP connection; client_addr must not shadow the bind addr.
        conn, client_addr = sock.accept()
        print('Connected by\n', client_addr, conn)
        # Without a timeout, recv() blocks forever on a silent client and the
        # `except socket.timeout` below can never fire.
        conn.settimeout(10)
        try:
            data = conn.recv(1024).decode("gbk")
            parts = data.split(',')  # split once instead of once per field
            if parts[0] == "get":
                client_id = parts[1]
                task_url = task_list.get_task()

                print("向client {0} 分配 {1}".format(client_id, task_url))
                conn.send(task_url.encode("gbk"))
            elif parts[0] == "done":
                client_id = parts[1]
                client_url = parts[2]
                print("client {0} 完成爬取 {1}".format(client_id, client_url))
                task_list.done_task(client_url)
                conn.send("ok".encode("gbk"))
        except socket.timeout:
            print("Timeout!")
        finally:
            conn.close()  # always release the connection, even on error