class ReptileLib(threading.Thread):
    """Crawler ("reptile") thread library.

    Owns the state shared between the crawler threads, the ``ReptileCtrl``
    controller and the ``ControlServer``, and runs as a thread that executes
    the control signals read from ``inSignalQueue``.

    ``homeUrls``, ``pages``, ``maxPages`` and the one-element flag lists are
    handed to collaborators by reference, so after construction they must
    only be mutated in place and never rebound.
    """

    def __init__(self):
        """Build the shared state and collaborators, then start the threads."""
        self.htmldb = HtmlDB()
        threading.Thread.__init__(self, name="reptilelib")
        print("... init ReptileLib ...")
        # Signal queues: the program is driven through them by the
        # human-machine interface (both are handed to ControlServer below).
        self.inSignalQueue = Q.Queue()
        self.outSignalQueue = Q.Queue()
        self.Flock = threading.RLock()
        # Controls whether the reptile threads keep running.
        self.continueRun = [True]
        # Controls whether the ReptileLib main loop keeps running
        # (full shutdown when cleared); checked by run().
        self.reptileLibRun = [True]
        self.curSiteID = [0]
        # The URL queue is created here and filled later by init().
        self.urlQueue = UrlQueue()
        self.urlist = Urlist()
        # Shared lists: created once, afterwards only modified in place.
        self.homeUrls = []
        self.pages = []
        self.maxPages = []
        # Defaults so that a "start" signal arriving before "init" starts
        # no threads instead of raising AttributeError in the control loop.
        self.threadNum = 0
        self.thlist = []
        self.reptilectrl = ReptileCtrl(
            homeUrls=self.homeUrls,
            continueRun=self.continueRun,
            urlist=self.urlist,
            urlQueue=self.urlQueue,
            maxPages=self.maxPages,
            pages=self.pages,
            outSignalQueue=self.outSignalQueue,
        )
        self.controlserver = ControlServer(self.inSignalQueue, self.outSignalQueue)
        # Start the control server and this thread.
        self.runInit()

    def runInit(self):
        """Start the control server thread and this thread."""
        self.controlserver.start()
        self.start()

    def run(self):
        """Main loop: block on ``inSignalQueue`` and execute each signal.

        A signal is a dict with at least a ``"type"`` key. Handled types:

        * ``"init"``   -- fresh run; also reads ``"homeurls"``,
          ``"maxpages"`` and ``"reptilenum"`` from the signal.
        * ``"resume"``, ``"stop"``, ``"halt"``, ``"status"`` -- forwarded to
          the ``ReptileCtrl`` method of the same name.
        * ``"start"``  -- create and start the crawler threads.

        Any other type is ignored.
        """
        print("... run while ...")
        # The flag is re-checked before every blocking get(), so clearing
        # reptileLibRun[0] ends the loop once the next signal was handled.
        while self.reptileLibRun[0]:
            print(".. while ReptileLib running ..")
            signal = self.inSignalQueue.get()
            print("get signal %s" % (signal,))
            _type = signal["type"]
            print("get type %s" % (_type,))
            # Compare by value (==), not identity (is): strings taken from
            # the queue need not be the same objects as these literals.
            if _type == "init":
                # Completely fresh run.
                print(".. init from empty project ..")
                self.init(
                    homeUrls=signal["homeurls"],
                    maxPages=signal["maxpages"],
                    threadNum=signal["reptilenum"],
                )
            elif _type == "resume":
                print(".. resume from database ..")
                self.reptilectrl.resume()
            elif _type == "stop":
                print(".. stop ..")
                self.reptilectrl.stop()
            elif _type == "halt":
                print(".. halt ..")
                self.reptilectrl.halt()
            elif _type == "status":
                # Status request; the controller puts the status in a queue.
                print(".. status ..")
                self.reptilectrl.status()
            elif _type == "start":
                # Run the reptile threads.
                print(".. run reptile threads ..")
                print("It works!")
                self.continueRun[0] = True
                self.initThreads()
                self.threadsRun()
        print("ReptileLib core stopped!")
        print("Reptile stopped")

    def init(self, homeUrls, maxPages, threadNum):
        """Fully initialise the crawl state for a first (or repeated) run.

        The shared lists are emptied and refilled in place so every object
        holding a reference to them sees the new data.

        Args:
            homeUrls: sized iterable of home URLs; ``pages`` receives one
                zero counter per entry.
            maxPages: iterable copied into the shared ``maxPages`` list.
            threadNum: number of ``Reptile`` threads ``initThreads`` creates.
        """
        # Slice assignment keeps list identity; copying first makes it safe
        # even when the argument is the shared list itself.
        self.homeUrls[:] = list(homeUrls)
        self.maxPages[:] = list(maxPages)
        self.threadNum = threadNum
        # self.pages counts the pages downloaded, one counter per home URL.
        self.pages[:] = [0] * len(homeUrls)
        # Initialise the URL queue and URL list from the new home URLs.
        self.urlQueue.init(self.homeUrls)
        self.urlQueue.initFrontPage()
        self.urlist.init(len(self.homeUrls))
        # Store the home URLs.
        self.htmldb.saveHomeUrls(homeUrls, maxPages, self.pages)

    def initThreads(self):
        """Create ``threadNum`` ``Reptile`` threads (not yet started)."""
        # Default: begin with site 0.
        self.curSiteID[0] = 0
        # NOTE: the original author planned to name reptiles after the site
        # prefix; for now they are simply numbered.
        self.thlist = [
            Reptile(
                name="reptile%d" % index,
                urlQueue=self.urlQueue,
                urlist=self.urlist,
                Flock=self.Flock,
                homeUrls=self.homeUrls,
                maxPageNums=self.maxPages,
                pages=self.pages,
                curSiteID=self.curSiteID,
                continueRun=self.continueRun,
            )
            for index in range(self.threadNum)
        ]

    def threadsRun(self):
        """Start every thread created by ``initThreads``."""
        for th in self.thlist:
            th.start()
class ReptileLib(threading.Thread):
    """Crawler ("reptile") thread library.

    Owns the state shared between the crawler threads, the ``ReptileCtrl``
    controller and the ``ControlServer``, and runs as a thread that executes
    the control signals read from ``inSignalQueue``.

    ``homeUrls``, ``pages``, ``maxPages``, ``imagenum`` and the one-element
    flag lists are handed to collaborators by reference, so after
    construction they must only be mutated in place and never rebound.
    """

    def __init__(self):
        """Build the shared state and collaborators, then start the threads."""
        self.htmldb = HtmlDB()
        threading.Thread.__init__(self, name="reptilelib")
        print("... init ReptileLib ...")
        # Signal queues: the program is driven through them by the
        # human-machine interface (both are handed to ControlServer below).
        self.inSignalQueue = Q.Queue()
        self.outSignalQueue = Q.Queue()
        self.Flock = threading.RLock()
        # Controls whether the reptile threads are running; run() uses it to
        # avoid starting a second set of threads.
        self.continueRun = [False]
        # Controls whether the ReptileLib main loop keeps running
        # (full shutdown when cleared); checked by run().
        self.reptileLibRun = [True]
        # The URL queue is created here and filled later by init().
        self.urlQueue = UrlQueue()
        self.urlist = Urlist()
        # Shared lists: created once, afterwards only modified in place.
        self.homeUrls = []
        self.pages = []
        self.imagenum = [0]
        print("-" * 50)
        print(".. init self.imagenum %s %s" % (self.imagenum, type(self.imagenum)))
        print("-" * 50)
        self.maxPages = []
        # Defaults so that a "start" signal arriving before "init" starts
        # no threads instead of raising AttributeError in the control loop.
        self.threadNum = 0
        self.thlist = []
        self.reptilectrl = ReptileCtrl(
            homeUrls=self.homeUrls,
            continueRun=self.continueRun,
            urlist=self.urlist,
            urlQueue=self.urlQueue,
            maxPages=self.maxPages,
            pages=self.pages,
            imagenum=self.imagenum,
            outSignalQueue=self.outSignalQueue,
        )
        self.controlserver = ControlServer(self.inSignalQueue, self.outSignalQueue)
        # Start the control server and this thread.
        self.runInit()

    def runInit(self):
        """Start the control server thread and this thread."""
        self.controlserver.start()
        self.start()

    def run(self):
        """Main loop: block on ``inSignalQueue`` and execute each signal.

        A signal is a dict with at least a ``"type"`` key. Handled types:

        * ``"init"``   -- fresh run; also reads ``"homeurls"``,
          ``"maxpages"`` and ``"reptilenum"`` from the signal.
        * ``"resume"``, ``"stop"``, ``"halt"``, ``"status"`` -- forwarded to
          the ``ReptileCtrl`` method of the same name.
        * ``"start"``  -- create and start the crawler threads unless
          ``continueRun[0]`` is already set.

        Any other type is ignored.
        """
        print("... run while ...")
        # The flag is re-checked before every blocking get(), so clearing
        # reptileLibRun[0] ends the loop once the next signal was handled.
        while self.reptileLibRun[0]:
            print(".. while ReptileLib running ..")
            signal = self.inSignalQueue.get()
            print("get signal %s" % (signal,))
            _type = signal["type"]
            print("get type %s" % (_type,))
            # Compare by value (==), not identity (is): strings taken from
            # the queue need not be the same objects as these literals.
            if _type == "init":
                # Completely fresh run.
                print(".. init from empty project ..")
                self.init(
                    homeUrls=signal["homeurls"],
                    maxPages=signal["maxpages"],
                    threadNum=signal["reptilenum"],
                )
            elif _type == "resume":
                print(".. resume from database ..")
                self.reptilectrl.resume()
            elif _type == "stop":
                print(".. stop ..")
                self.reptilectrl.stop()
            elif _type == "halt":
                print(".. halt ..")
                self.reptilectrl.halt()
            elif _type == "status":
                # Status request; the controller puts the status in a queue.
                print(".. status ..")
                self.reptilectrl.status()
            elif _type == "start":
                # Run the reptile threads, but only when none are running.
                print(".. run reptile threads ..")
                print("It works!")
                if not self.continueRun[0]:
                    self.continueRun[0] = True
                    self.initThreads()
                    self.threadsRun()
        print("ReptileLib core stopped!")
        print("Reptile stopped")

    @staticmethod
    def _refill(shared, values):
        """Replace the contents of ``shared`` in place, echoing each value."""
        # Copy first so the call is safe even if ``values`` is ``shared``.
        values = list(values)
        for value in values:
            print(value)
        shared[:] = values

    def init(self, homeUrls, maxPages, threadNum):
        """Fully initialise the crawl state for a first (or repeated) run.

        The shared lists are emptied and refilled in place so every object
        holding a reference to them sees the new data.

        Args:
            homeUrls: sized iterable of home URLs; ``pages`` receives one
                zero counter per entry.
            maxPages: iterable copied into the shared ``maxPages`` list.
            threadNum: number of ``Reptile`` threads ``initThreads`` creates.
        """
        print(".. init homeUrls")
        self._refill(self.homeUrls, homeUrls)
        self._refill(self.maxPages, maxPages)
        self.threadNum = threadNum
        print(".. init maxPages: %s" % (self.maxPages,))
        print(".. init pages %s" % (self.pages,))
        # self.pages counts the pages downloaded, one counter per home URL.
        self.pages[:] = [0] * len(homeUrls)
        # Initialise the URL queue from the new home URLs.
        self.urlQueue.init(self.homeUrls)
        self.urlQueue.initFrontPage()
        # NOTE(review): the call self.urlist.init(len(self.homeUrls)) is
        # commented out in this revision and is left disabled -- confirm.
        # Store the home URLs.
        self.htmldb.saveHomeUrls(homeUrls, maxPages, self.pages)

    def initThreads(self):
        """Create ``threadNum`` ``Reptile`` threads (not yet started)."""
        print("$" * 50)
        print("init thread imagenum %s %s" % (self.imagenum, type(self.imagenum)))
        print("$" * 50)
        # NOTE: the original author planned to name reptiles after the site
        # prefix; for now they are simply numbered.
        self.thlist = [
            Reptile(
                name="reptile%d" % index,
                urlQueue=self.urlQueue,
                urlist=self.urlist,
                Flock=self.Flock,
                homeUrls=self.homeUrls,
                maxPageNums=self.maxPages,
                pages=self.pages,
                imagenum=self.imagenum,
                continueRun=self.continueRun,
            )
            for index in range(self.threadNum)
        ]

    def threadsRun(self):
        """Start every thread created by ``initThreads``."""
        for th in self.thlist:
            th.start()