class ReptileLib:
    """Crawler-thread library.

    Owns the crawler worker threads and the shared URL bookkeeping
    structures, and provides the init hooks through which a controller
    can later halt/resume the crawl.
    """

    def __init__(self):
        # Shared crawl state; Urlist / Queue / UrlQueue are project types
        # defined elsewhere in this project.
        self.urlist = Urlist()
        self.queue = Queue()
        self.in_queue = UrlQueue()

    def init(self, home_list, reptile_num):
        """Run all dynamic initialisation.

        home_list   -- list of seed (home) URLs; only its length is used here
        reptile_num -- number of crawler threads to create in InitReptiles()
        """
        home_num = len(home_list)
        # Number of crawler threads, consumed later by InitReptiles().
        self.reptile_num = reptile_num
        # Size the per-site bookkeeping structures to the number of seeds.
        self.urlist.init(home_num)
        self.in_queue.init(home_num)

    def InitReptiles(self):
        """Create the crawler threads; they are driven by ReptileCtrlRcv."""
        # Idiom fix: list comprehension instead of a manual append loop.
        self.reptiles = [Reptile() for _ in range(self.reptile_num)]
def init(self):
    """Obtain the initial home_lists to crawl and the list of client IPs."""
    # Seed (home) URLs the crawl starts from.
    self.home_list = [
        "http://www.cau.edu.cn",
        "http://www.sina.com.cn",
    ]
    # Local URL storage: one sub-queue per home URL for newly received
    # links, all managed through a single UrlQueue.
    self.queue = UrlQueue(self.home_list)
    # FIXME: client addresses are hard-coded; this must become dynamic
    # configuration.
    self.clientIPs = [
        ('127.0.0.1', 80),
    ]
def __init__(self):
    # All three are project-defined types; their constructors' side effects
    # are not visible here, so the call order is preserved as-is.
    self.urlist = Urlist()
    # NOTE(review): a plain Queue here, but UrlQueue for in_queue -- confirm
    # this asymmetry is intentional.
    self.queue = Queue()
    self.in_queue = UrlQueue()
class CentreServ:
    """Master control server.

    Runs as the main program on the control host: accepts TCP connections
    from crawler clients, exchanges XML <signal> messages with them, and
    hands out URL crawl tasks.  A separate thread is intended to drive the
    GUI (see frame()).
    """

    def __init__(self):
        # BUG FIX: the original called `Server.__init__()` without an
        # instance, which raises TypeError; pass self explicitly.
        # NOTE(review): CentreServ does not inherit from Server -- confirm
        # whether it should be declared `class CentreServ(Server)`.
        Server.__init__(self)
        self.tcpSerSock = socket(AF_INET, SOCK_STREAM)
        self.tcpSerSock.bind(ADDR)   # ADDR is defined elsewhere in the file
        self.tcpSerSock.listen(5)
        self.num = 0                 # count of client connections handled
        self.running = True          # keep-running flag for the accept loop
        self.signal = Signal()

    def init(self):
        """Obtain the initial home_lists to crawl and the client IP list."""
        # Seed (home) URLs the crawl starts from.
        self.home_list = [
            "http://www.cau.edu.cn",
            "http://www.sina.com.cn",
        ]
        # Local URL storage: one sub-queue per home URL for newly received
        # links, all managed through a single UrlQueue.
        self.queue = UrlQueue(self.home_list)
        # FIXME: client addresses are hard-coded; this must become dynamic
        # configuration.
        self.clientIPs = [
            ('127.0.0.1', 80),
        ]

    def run(self):
        """Accept loop: spawn one daemon thread per client connection."""
        # Simplified from `while True: if not self.running: break`.
        while self.running:
            print('waiting for connection...')
            tcpCliSock, addr = self.tcpSerSock.accept()
            print('...connected from: %s' % (addr,))
            print('start a new thread')
            # Serve each client on its own daemon thread so the accept loop
            # stays responsive.
            t = threading.Thread(target=self.getConnection, args=[tcpCliSock])
            t.setDaemon(True)
            t.start()
            # NOTE(review): this fires right after start(), not when the
            # worker thread actually finishes.
            print('thread is ended')
        self.tcpSerSock.close()

    def parseSignal(self, signal):
        """Parse an XML <signal> document and return its `type` attribute.

        BUG FIX: the parsed type was computed and then discarded; it is now
        returned so callers can dispatch on it (callers that ignored the
        previous implicit None return are unaffected).
        """
        d = pq(signal)
        return d('signal').attr('type')

    def __get_urls_from_queues(self, limit=100):
        """Pop up to `limit` URLs from the local queues.

        Returns (per UrlQueue.getUrlList) a dict of the form:
            {'siteID': id, 'urls': [[title, path], ...]}
        """
        # Generalized: the batch size was hard-coded to 100; it is now a
        # defaulted parameter, so existing callers are unaffected.
        return self.queue.getUrlList(limit)

    def __get_urls_from_client(self, data):
        """Parse the <url> elements a client sent.

        Returns a list of [siteID, title, path] triples.
        """
        signal = pq(data)
        # BUG FIX: the selector must be the string 'url'; the bare name
        # `url` was an undefined variable (NameError at runtime).
        li = signal('url')
        urls = []
        for i in range(len(li)):
            u = li.eq(i)
            urls.append([u.attr('siteID'), u.attr('title'), u.attr('path')])
        # BUG FIX: the result list was built but never returned.
        return urls

    def __get_status(self, data):
        """Extract the status counters a client reports about itself."""
        signal = pq(data)
        res = {}
        res['pages_num'] = signal.attr('pages_num')
        res['urlist_num'] = signal.attr('urlist_num')
        res['queue_num'] = signal.attr('queue_num')
        return res

    #---------------------------------------------
    def __send_signal(self, clientsock, sig_type):
        """Send a bare <signal type='...'/> message (consolidates the four
        previously copy-pasted send methods below)."""
        clientsock.send("<signal type='%s'/>" % sig_type)

    def __send_init(self, clientsock):
        """Send the `init` signal to a client."""
        self.__send_signal(clientsock, 'init')

    def __send_halt(self, clientsock):
        """Send the `halt` signal to a client."""
        self.__send_signal(clientsock, 'halt')

    def __send_stop(self, clientsock):
        """Send the `stop` signal to a client."""
        self.__send_signal(clientsock, 'stop')

    def __send_resume(self, clientsock):
        """Send the `resume` signal to a client."""
        self.__send_signal(clientsock, 'resume')

    def __send_urltask(self, clientsock):
        """Send a batch of URL tasks to this client."""
        # Pop urls from the local queues.
        urldoc = self.__get_urls_from_queues()
        siteID = urldoc['siteID']
        urls = urldoc['urls']
        signal = pq('<signal></signal>')
        signal('signal').attr('type', 'urltask')
        # BUG FIX: the original selected a nonexistent <siteID> element, so
        # the attribute was silently dropped; set it on <signal> instead.
        signal('signal').attr('siteID', siteID)
        for url in urls:
            u = pq('<url/>')
            u.attr('title', url[0])
            u.attr('path', url[1])
            signal.append(u)
        # BUG FIX: socket.send needs a byte/string payload, not a PyQuery
        # object; serialize the document before sending.
        clientsock.send(str(signal))

    #------------------------------------------------------
    def frame(self):
        """Show the GUI (not implemented yet)."""
        pass

    def getConnection(self, clientsock):
        """Per-client worker: receive until the peer closes the connection."""
        # NOTE(review): unsynchronized increment from multiple threads; fine
        # for a rough counter, confirm if exact counts matter.
        self.num += 1
        while True:
            data = clientsock.recv(4096)
            if not data:  # empty read => peer closed the connection
                print('received empty data')
                print('thread is to end')
                break
            print(data)
        clientsock.close()