Esempio n. 1
0
class ReptileLib:
    '''
    Thread library.

    Responsible for the crawling tasks and provides the interface used
    for halt and resume.
    '''
    def __init__(self):
        # Containers shared by the crawler threads; they are sized later
        # by init().
        self.urlist = Urlist()
        self.queue = Queue()
        self.in_queue = UrlQueue()

    def init(self, home_list, reptile_num):
        '''
        Perform all of the dynamic (run-time) initialisation.

        home_list   -- sequence of home URLs; only its length is used here
        reptile_num -- number of crawler threads InitReptiles() will create
        '''
        site_count = len(home_list)
        # number of threads
        self.reptile_num = reptile_num
        # urlist and in_queue are both initialised with the number of homes
        self.urlist.init(site_count)
        self.in_queue.init(site_count)

    def InitReptiles(self):
        '''
        Create every crawler thread object.

        The threads are only constructed here; per the original note their
        running is controlled by ReptileCtrlRcv.
        '''
        # Bind the attribute first, then fill the same list in place.
        workers = self.reptiles = []
        workers.extend(Reptile() for _ in range(self.reptile_num))
Esempio n. 2
0
 def init(self):
     '''
     Set up the initial home list to crawl from, the local URL queue
     and the list of client addresses.
     '''
     # Home URLs (hard-coded for now).
     seeds = [
         "http://www.cau.edu.cn",
         "http://www.sina.com.cn",
     ]
     self.home_list = seeds
     # Local URL storage: one queue per home URL holds the new links
     # received for it; UrlQueue manages all of them.
     self.queue = UrlQueue(seeds)
     # TODO: the client list needs to be configured dynamically.
     self.clientIPs = [('127.0.0.1', 80)]
Esempio n. 3
0
 def __init__(self):
     '''
     Create the three URL containers held by this object.
     '''
     # Each container is constructed with no arguments here.
     self.urlist = Urlist()
     self.queue = Queue()
     self.in_queue = UrlQueue()
Esempio n. 4
0
class CentreServ:
    '''
    主服务器控制程序
    将以主程序方式运行于控制服务器 
    设立一个单独线程用于GUI的显示和控制
    '''
    def __init__(self):
        Server.__init__()
        self.tcpSerSock = socket(AF_INET, SOCK_STREAM)
        self.tcpSerSock.bind(ADDR)
        self.tcpSerSock.listen(5)
        self.num = 0
        #继续运行信号
        self.running = True
        self.signal = Signal()

    def init(self):
        '''
        取得初始的home_lists    进行爬取
        得到各client的ip列表
        '''
        #取得home_lists
        #取得urlists
        self.home_list = [
            "http://www.cau.edu.cn",
            "http://www.sina.com.cn",
        ]
        #新建本地url存储队列
        #为每一个home_url建立一个队列存储收到的新链接
        #由 UrlQueue统一管理
        self.queue = UrlQueue(self.home_list)
        #!!!!!!!!!!!!!!!!此处需要动态配置
        self.clientIPs = [
            ('127.0.0.1', 80),
        ]

    def run(self):
        while True:
            if not self.running:
                '''
                停止运行
                '''
                break
            print 'waiting for connection...'
            tcpCliSock, addr = self.tcpSerSock.accept()
            print '...connected from:', addr

            print 'start a new thread'
            #start new thread to serve client
            t = threading.Thread(target=self.getConnection, args=[tcpCliSock])
            t.setDaemon(True)
            t.start()
            print 'thread is ended'
        self.tcpSerSock.close()

    def parseSignal(self, signal):
        '''
        signal 解析
        '''
        d = pq(signal)
        _type = d('signal').attr('type')

    def __get_urls_from_queues(self):
        '''
        从本平台数据中提取一定量的url
        返回格式:
        {
            siteID: id
            urls:   [
                        [title, path],
                        [title, path],
                        [title, path],
                    ]
        }
        '''
        return self.queue.getUrlList(100)

    def __get_urls_from_client(self, data):
        '''
        receive urls from a client
         
        '''
        signal = pq(data)
        li = signal(url)
        urls = []
        for i in range(len(li)):
            url = []
            u = li.eq(i)
            url.append(u.attr('siteID'))
            url.append(u.attr('title'))
            url.append(u.attr('path'))
            urls.append(url)

    def __get_status(self, data):
        '''
        从客户端取得status信号
        '''
        signal = pq(data)
        res = {}
        res['pages_num'] = signal.attr('pages_num')
        res['urlist_num'] = signal.attr('urlist_num')
        res['queue_num'] = signal.attr('queue_num')
        return res

    #---------------------------------------------
    def __send_init(self, clientsock):
        '''
        向客户端发送init信号 
        '''
        signal = "<signal type='init'/>"
        clientsock.send(signal)

    def __send_halt(self, clientsock):
        '''
        向客户端发送halt信号 
        '''
        signal = "<signal type='halt'/>"
        clientsock.send(signal)

    def __send_stop(self, clientsock):
        '''
        向客户端发送stop信号 
        '''
        signal = "<signal type='stop'/>"
        clientsock.send(signal)

    def __send_resume(self, clientsock):
        '''
        向客户端发送resume信号 
        '''
        signal = "<signal type='resume'/>"
        clientsock.send(signal)

    def __send_urltask(self, clientsock):
        '''
        send more urltask to this client
        '''
        #pop urls from queue
        urldoc = self.__get_urls_from_queues()
        siteID = urldoc['siteID']
        urls = urldoc['urls']
        signal = pq('<signal></signal>')
        signal('signal').attr('type', 'urltask')
        signal('siteID').attr('siteID', siteID)
        for url in urls:
            u = pq('<url/>')
            u.attr('title', url[0])
            u.attr('path', url[1])
            signal.append(u)
        clientsock.send(signal)


#------------------------------------------------------

    def frame(self):
        '''
        显示图形界面
        '''
        pass

    def getConnection(self, clientsock):
        self.num += 1
        while True:
            data = clientsock.recv(4096)
            if not len(data):
                print 'received empty data'
                print 'thread is to end'
                break
            else:
                print data

                #clientsock.send('[%s] %s'% (ctime(),data))
        clientsock.close()