if url_cnt != 0: time.sleep(1) driver.find_element_by_xpath(__XPATH_NEXT__).click() # 从已经加载好的网页源码中读取url selector = etree.HTML(driver.page_source) urls = selector.xpath(__XPATH__URL__) url_cnt += len(urls) # 将这一个页面中读取到的url加入任务列表中 task_list.put_tasks(urls) driver.close() # 监听8888端口,等待slave连接并为其分配任务 print("等待Slave节点连接中...") while True: # 若任务列表中的所有任务都已完成,则程序退出 if task_list.is_empty(): print("所有任务已完成") sock.close() break # 等待并接收来自Slave的连接(程序会在这里停住,直到有Slave连接) conn, addr = sock.accept() conn.settimeout(10) try: # 接收到来自Slave的请求(程序会在这里停住,直到接收到Slave的消息) req = conn.recv(1024).decode("utf-8") if req.startswith("get"): # 如果Slave发送的消息以"get"开头,则给它发回一个用来爬取的URL # 消息的格式:"get,123456" # slave_id取得发来消息的Slave的ID slave_id = req.split(",")[1] task_url = task_list.get_task()
def main():
    """Master node: scrape article URLs from the NetEase finance index page
    and distribute them to slave crawlers over a simple TCP protocol.

    Protocol (gbk-encoded, comma-separated messages from clients):
        "get,<client_id>"         -> reply with the next task URL
        "done,<client_id>,<url>"  -> mark <url> as finished, reply "ok"

    Runs until the task list reports all tasks complete, then closes the
    listening socket and returns.
    """
    addr = "0.0.0.0"
    port = 9992
    main_url = "http://money.163.com/special/00252C1E/gjcj.html"
    # TaskList is defined elsewhere in this project; presumably `timeout` is
    # how long a handed-out task may stay unfinished before it is re-issued
    # -- TODO confirm against the TaskList implementation.
    task_list = TaskList(timeout=30)
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind((addr, port))
    sock.listen(50)
    #driver = webdriver.Chrome()
    #driver.get(main_url)
    print("正在从网页中解析URL链接...")

    def gethtmltext(url, code="gbk"):
        """Fetch *url* and return its body decoded as *code*, or "" on any
        request failure (connection error, timeout, bad HTTP status)."""
        try:
            # A timeout keeps the master from hanging forever on a dead host.
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            r.encoding = code
            return r.text
        except requests.exceptions.RequestException:
            # RequestException covers ConnectionError, Timeout and the
            # HTTPError raised by raise_for_status(); previously only
            # ConnectionError was caught, so an HTTP error crashed main().
            return ""

    html = gethtmltext(main_url)
    try:
        if html == "":
            print("---html error1!---")
        soup = BeautifulSoup(html, 'html.parser')
        url_info = soup.find_all('div', attrs={'class': 'list_item clearfix'})
        news_url = list()
        for item in url_info:
            try:
                heading = item.find(name='h2')
                url = heading.find(name='a').attrs['href']
                news_url.append(url)
                print(url)
            except Exception:
                # Skip entries that lack the expected <h2><a href=...> shape.
                continue
        task_list.put_tasks(news_url)
    except Exception:
        print("---url error2!---")
    # driver.close()
    print("等待client中.......")
    while True:
        # Exit once every distributed task has been reported done.
        if task_list.is_empty():
            print("====任务完成====")
            sock.close()
            break
        # Accept one request per connection; use a distinct name so the
        # listening address `addr` is not clobbered.
        conn, client_addr = sock.accept()
        print('Connected by\n', client_addr, conn)
        # Without a recv timeout the socket.timeout handler below could never
        # fire, and one silent client would block the master forever.
        conn.settimeout(10)
        try:
            data = conn.recv(1024).decode("gbk")
            fields = data.split(',')
            if fields[0] == "get":
                # "get,<id>": hand the next pending URL to this client.
                client_id = fields[1]
                task_url = task_list.get_task()
                print("向client {0} 分配 {1}".format(client_id, task_url))
                # sendall guarantees the whole message is transmitted.
                conn.sendall(task_url.encode("gbk"))
            elif fields[0] == "done":
                # "done,<id>,<url>": mark the URL crawled and acknowledge.
                client_id = fields[1]
                client_url = fields[2]
                print("client {0}' 完成爬取 {1}".format(client_id, client_url))
                task_list.done_task(client_url)
                conn.sendall("ok".encode("gbk"))
        except socket.timeout:
            print("Timeout!")
        finally:
            # Always release the per-request connection.
            conn.close()