コード例 #1
 # Master dispatch step: hand out crawl tasks to Slaves and record
 # completions; stops once the task list is empty.
 # NOTE(review): this is the body of an enclosing loop not visible here.
 if task_list.is_empty():
     print("所有任务已完成")
     sock.close()
     break
 # Wait for and accept a connection from a Slave (blocks here until a
 # Slave connects).
 conn, addr = sock.accept()
 # Bound the wait on recv/send so a stalled Slave cannot hang the master.
 conn.settimeout(10)
 try:
     # Receive the Slave's request (blocks here until a message arrives,
     # or socket.timeout fires after 10s).
     req = conn.recv(1024).decode("utf-8")
     if req.startswith("get"):
         # A message starting with "get" asks for a URL to crawl.
         # Message format: "get,123456"
         # slave_id is the ID of the Slave that sent the message.
         slave_id = req.split(",")[1]
         task_url = task_list.get_task()
         # Send the url back to the Slave.
         print("向'Slave {0}' 分配爬取 '{1}'".format(slave_id, task_url))
         conn.send(task_url.encode("utf-8"))
     elif req.startswith("done"):
         # A message starting with "done" tells the master the Slave
         # finished a task.
         # Message format: "done,123456,https://finance.sina.com.cn/china/gncj/2018-10-17/doc-ifxeuwws5236619.shtml"
         # slave_id is the ID of the sending Slave; done_url is the link
         # of the page it finished crawling.
         slave_id = req.split(",")[1]
         done_url = req.split(",")[2]
         # Remove the crawled page from the task list for good.
         print("'Slave {0}' 完成爬取 '{1}'".format(slave_id, done_url))
         task_list.done_task(done_url)
         conn.send("ok".encode("utf-8"))
 except socket.timeout:
     print("套接字连接超时")
コード例 #2
def main():
    """Master node of a distributed crawler.

    Parses article URLs from a news index page, then serves them over TCP
    to slave clients until the task list is empty.

    Protocol (comma-separated, gbk-encoded):
        "get,<client_id>"          -> server replies with a task URL
        "done,<client_id>,<url>"   -> server marks <url> done, replies "ok"
    """
    addr = "0.0.0.0"
    port = 9992

    main_url = "http://money.163.com/special/00252C1E/gjcj.html"

    # Tasks not reported "done" within 30s become re-assignable.
    task_list = TaskList(timeout=30)
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind((addr, port))
    sock.listen(50)

    print("正在从网页中解析URL链接...")

    def gethtmltext(url, code="gbk"):
        """Fetch *url* and return its text decoded as *code*.

        Returns "" on any request failure (connection error, timeout,
        HTTP error status) instead of raising.
        """
        try:
            r = requests.get(url)
            r.raise_for_status()
            r.encoding = code
            return r.text
        except requests.exceptions.RequestException:
            # Broadened from ConnectionError: timeouts and HTTP error
            # statuses also fall back to "" rather than crashing startup.
            return ""

    html = gethtmltext(main_url)
    try:
        if html == "":
            print("---html error1!---")
        soup = BeautifulSoup(html, 'html.parser')
        url_info = soup.find_all('div', attrs={'class': 'list_item clearfix'})
        news_url = list()
        for item in url_info:
            # Skip list entries that lack an <h2><a href=...> structure.
            try:
                heading = item.find(name='h2')
                url = heading.find(name='a').attrs['href']
                news_url.append(url)
                print(url)
            except (AttributeError, KeyError):
                # Was a bare "except:": narrowed to the failures a
                # missing tag/attribute actually produces.
                continue
        task_list.put_tasks(news_url)
    except Exception:
        # Was a bare "except:": narrowed so KeyboardInterrupt/SystemExit
        # still propagate.
        print("---url error2!---")

    print("等待client中.......")
    while True:
        if task_list.is_empty():
            print("====任务完成====")
            sock.close()
            break

        # client_addr: renamed so it no longer shadows the bind address.
        conn, client_addr = sock.accept()
        print('Connected by\n', client_addr, conn)
        # Without a timeout the socket.timeout handler below was dead code;
        # 10s matches the companion master implementation.
        conn.settimeout(10)
        try:
            data = conn.recv(1024).decode("gbk")
            fields = data.split(',')  # split once instead of per access
            if fields[0] == "get":
                client_id = fields[1]
                task_url = task_list.get_task()

                print("向client {0} 分配 {1}".format(client_id, task_url))
                conn.send(task_url.encode("gbk"))
            elif fields[0] == "done":
                client_id = fields[1]
                client_url = fields[2]
                print("client {0}' 完成爬取 {1}".format(client_id, client_url))
                task_list.done_task(client_url)
                conn.send("ok".encode("gbk"))
        except socket.timeout:
            print("Timeout!")
        finally:
            # Previously only reached on the no-exception path -> leaked
            # sockets when recv/send raised; always release the client
            # connection now.
            conn.close()