def __init__(self):
    """Set up the globally shared crawler state and start the control threads.

    Every list-valued attribute created here is passed by reference to
    ``ReptileCtrl`` (see the keyword arguments below), so the lists must be
    mutated in place afterwards and never rebound to new objects.
    """
    self.htmldb = HtmlDB()
    threading.Thread.__init__(self, name="reptilelib")
    # Single-argument parenthesised prints emit the same text whether
    # ``print`` is a statement (Python 2) or a function.
    print("... init ReptileLib ...")

    # Signal queues: the operator interface drives the program through them.
    self.inSignalQueue, self.outSignalQueue = Q.Queue(), Q.Queue()
    self.Flock = threading.RLock()

    # One-element lists act as flags that can be shared by reference.
    # continueRun[0]   -> whether the reptile threads should be running.
    # reptileLibRun[0] -> whether the library main program / server should
    #                     keep running (False means a complete shutdown).
    self.continueRun = [False]
    self.reptileLibRun = [True]

    # URL queue and URL list are created once, here in the library.
    self.urlQueue = UrlQueue()
    self.urlist = Urlist()

    # Shared lists: create them empty now, only mutate in place later.
    self.homeUrls = []
    self.pages = []
    self.imagenum = [0]
    separator = "-" * 50
    print(separator)
    print("%s %s %s" % (".. init self.imagenum", self.imagenum, type(self.imagenum)))
    print(separator)
    self.maxPages = []

    shared_state = dict(
        homeUrls=self.homeUrls,
        continueRun=self.continueRun,
        urlist=self.urlist,
        urlQueue=self.urlQueue,
        maxPages=self.maxPages,
        pages=self.pages,
        imagenum=self.imagenum,
        outSignalQueue=self.outSignalQueue,
    )
    self.reptilectrl = ReptileCtrl(**shared_state)
    self.controlserver = ControlServer(self.inSignalQueue, self.outSignalQueue)

    # Start the control server and this thread.
    self.runInit()
class ReptileLib(threading.Thread):
    """Crawler ("reptile") thread library.

    Owns the state shared by the crawler threads and runs a control loop
    (``run``) that reacts to signal dicts taken from ``inSignalQueue``.

    All prints use the single-argument parenthesised form so the emitted text
    is the same whether ``print`` is a statement (Python 2) or a function.
    """

    def __init__(self):
        """Create the globally shared state and start the control threads.

        The list-valued attributes (``homeUrls``, ``pages``, ``imagenum``,
        ``maxPages``, ``continueRun``) are passed by reference to
        ``ReptileCtrl`` here and to each ``Reptile`` in ``initThreads``, so
        they must only be mutated in place and never rebound.
        """
        self.htmldb = HtmlDB()
        threading.Thread.__init__(self, name="reptilelib")
        print("... init ReptileLib ...")

        # Signal queues: the operator interface drives the program through them.
        self.inSignalQueue = Q.Queue()
        self.outSignalQueue = Q.Queue()
        self.Flock = threading.RLock()

        # One-element lists act as flags that can be shared by reference.
        # continueRun[0]: whether the reptile threads should be running.
        self.continueRun = [False]
        # reptileLibRun[0]: whether the library main loop / server keeps
        # running; setting it to False requests a complete shutdown.
        self.reptileLibRun = [True]

        # URL queue and URL list are created once, here in the library.
        self.urlQueue = UrlQueue()
        self.urlist = Urlist()

        # Shared lists: created empty here, only mutated in place afterwards.
        self.homeUrls = []
        self.pages = []
        self.imagenum = [0]
        print("-" * 50)
        print("%s %s %s" % (".. init self.imagenum", self.imagenum, type(self.imagenum)))
        print("-" * 50)
        self.maxPages = []

        self.reptilectrl = ReptileCtrl(
            homeUrls=self.homeUrls,
            continueRun=self.continueRun,
            urlist=self.urlist,
            urlQueue=self.urlQueue,
            maxPages=self.maxPages,
            pages=self.pages,
            imagenum=self.imagenum,
            outSignalQueue=self.outSignalQueue,
        )
        self.controlserver = ControlServer(self.inSignalQueue, self.outSignalQueue)

        # Start the control server and this thread.
        self.runInit()

    def runInit(self):
        """Start the control server, then start this thread."""
        self.controlserver.start()
        self.start()

    def run(self):
        """Main control loop: block on ``inSignalQueue`` and dispatch signals.

        Each signal is a dict with at least a ``"type"`` key:

        * ``"init"``   -- fresh project; also needs ``"homeurls"``,
          ``"maxpages"`` and ``"reptilenum"`` (forwarded to ``init``).
        * ``"resume"`` / ``"stop"`` / ``"halt"`` / ``"status"`` -- forwarded
          to the method of the same name on ``self.reptilectrl``.
        * ``"start"``  -- build and start the reptile threads, unless
          ``continueRun[0]`` is already true.

        Unknown types are ignored.  The loop re-checks ``reptileLibRun[0]``
        before waiting for each signal and ends once it is false.

        NOTE(review): ``threadNum`` is only assigned in ``init``; a "start"
        that arrives before any "init" raises ``AttributeError`` in
        ``initThreads`` -- confirm callers always send "init" first.
        """
        print("... run while ...")
        # The flag is consulted (instead of looping unconditionally) so the
        # shutdown messages below are reachable.
        while self.reptileLibRun[0]:
            print(".. while ReptileLib running ..")
            signal = self.inSignalQueue.get()
            print("%s %s" % ("get signal", signal))
            _type = signal["type"]
            print("%s %s" % ("get type", _type))

            # Compare by value: ``is`` only matches strings that happen to
            # be the very same object, which runtime-built strings are not.
            if _type == "init":
                # Fresh run from an empty project.
                print(".. init from empty project ..")
                self.init(
                    homeUrls=signal["homeurls"],
                    maxPages=signal["maxpages"],
                    threadNum=signal["reptilenum"],
                )
            elif _type == "resume":
                print(".. resume from database ..")
                self.reptilectrl.resume()
            elif _type == "stop":
                print(".. stop ..")
                self.reptilectrl.stop()
            elif _type == "halt":
                print(".. halt ..")
                self.reptilectrl.halt()
            elif _type == "status":
                # Status request; the original note says the controller
                # puts the status in a queue.
                print(".. status ..")
                self.reptilectrl.status()
            elif _type == "start":
                # Run the reptile threads (only if not already running).
                print(".. run reptile threads ..")
                print("It works!")
                if not self.continueRun[0]:
                    self.continueRun[0] = True
                    self.initThreads()
                    self.threadsRun()

        print("ReptileLib core stopped!")
        print("Reptile stopped")

    def init(self, homeUrls, maxPages, threadNum):
        """Fully (re)initialise the shared state for a fresh run.

        May be called repeatedly.  Because the lists are shared by reference
        with other objects, each one is emptied and refilled in place rather
        than rebound.

        Args:
            homeUrls: iterable of home URLs; copied into ``self.homeUrls``.
            maxPages: iterable of per-site limits; copied into
                ``self.maxPages``.
            threadNum: number of reptile threads ``initThreads`` will create.
        """

        def refill(shared, values):
            # Snapshot first so refilling still works when ``values`` is the
            # shared list itself (clearing it would otherwise lose the data).
            snapshot = list(values)
            del shared[:]
            for item in snapshot:
                print(item)
                shared.append(item)

        print(".. init homeUrls")
        refill(self.homeUrls, homeUrls)
        refill(self.maxPages, maxPages)
        self.threadNum = threadNum
        # self.maxPages is deliberately NOT rebound to ``maxPages``: doing so
        # would detach it from the list object given to ReptileCtrl.
        print("%s %s" % (".. init maxPages:", self.maxPages))
        print("%s %s" % (".. init pages", self.pages))

        # self.pages holds one counter per home URL, reset to zero here
        # (per the original note: used to count downloaded pages).
        del self.pages[:]
        self.pages.extend([0] * len(homeUrls))

        # (Re)initialise the URL queue from the home URLs.
        self.urlQueue.init(self.homeUrls)
        self.urlQueue.initFrontPage()
        # ``self.urlist.init(len(self.homeUrls))`` was disabled in the
        # original source and is left disabled.

        # Persist the home URLs.
        self.htmldb.saveHomeUrls(homeUrls, maxPages, self.pages)

    def initThreads(self):
        """Create ``self.threadNum`` Reptile threads in ``self.thlist``.

        The threads are only constructed here; ``threadsRun`` starts them.
        """
        self.thlist = []
        # Original note: default is to begin from site 0.
        print("$" * 50)
        print("%s %s %s" % ("init thread imagenum", self.imagenum, type(self.imagenum)))
        print("$" * 50)
        for i in range(self.threadNum):
            # TODO (from the original notes): the name prefix should change
            # so that reptiles are named after their site prefix.
            th = Reptile(
                name="reptile%d" % i,
                urlQueue=self.urlQueue,
                urlist=self.urlist,
                Flock=self.Flock,
                homeUrls=self.homeUrls,
                maxPageNums=self.maxPages,
                pages=self.pages,
                imagenum=self.imagenum,
                continueRun=self.continueRun,
            )
            self.thlist.append(th)

    def threadsRun(self):
        """Start every thread created by ``initThreads``."""
        for th in self.thlist:
            th.start()