Beispiel #1
0
    def __init__(self):
        """
        全局数据控制
        """
        self.htmldb = HtmlDB()
        threading.Thread.__init__(self, name="reptilelib")
        print "... init ReptileLib ..."
        # 信号队列 由人机界面控制程序运行
        self.inSignalQueue = Q.Queue()
        self.outSignalQueue = Q.Queue()
        self.Flock = threading.RLock()

        # 控制reptile线程是否运行
        self.continueRun = [False]
        # 控制reptilelib 主程序及服务器是否运行 是否完全关闭
        self.reptileLibRun = [True]

        # urlQueue and init in lib
        self.urlQueue = UrlQueue()

        self.urlist = Urlist()
        # 为了列表的共享性 初始的数据初始化[] 之后不能随意改变
        self.homeUrls = []
        self.pages = []
        self.imagenum = []
        self.imagenum.append(0)
        print "-" * 50
        print ".. init self.imagenum", self.imagenum, type(self.imagenum)
        print "-" * 50
        self.maxPages = []

        self.reptilectrl = ReptileCtrl(
            homeUrls=self.homeUrls,
            continueRun=self.continueRun,
            urlist=self.urlist,
            urlQueue=self.urlQueue,
            maxPages=self.maxPages,
            pages=self.pages,
            imagenum=self.imagenum,
            outSignalQueue=self.outSignalQueue,
        )
        self.controlserver = ControlServer(self.inSignalQueue, self.outSignalQueue)
        # run init thread
        self.runInit()
Beispiel #2
0
class ReptileLib(threading.Thread):
    """
    Crawler thread library.

    Owns the crawl state that is handed to ReptileCtrl and to the Reptile
    worker threads, and runs as a thread that dispatches control signals
    read from ``inSignalQueue``.

    The shared state lives in lists (``homeUrls``, ``maxPages``, ``pages``,
    ``imagenum``, ``continueRun``).  The list objects themselves are passed
    to the collaborators, so they must only ever be mutated in place and
    never rebound to a new list.
    """

    def __init__(self):
        """
        Create the shared state and the collaborators, then call
        ``runInit()``, which starts the control server and this thread.
        """
        self.htmldb = HtmlDB()
        threading.Thread.__init__(self, name="reptilelib")
        print("... init ReptileLib ...")
        # Signal queues: the user-interface side drives the program
        # through these.
        self.inSignalQueue = Q.Queue()
        self.outSignalQueue = Q.Queue()
        self.Flock = threading.RLock()

        # Whether the reptile threads should run (one-element list so the
        # flag can be shared by reference).
        self.continueRun = [False]
        # Whether the reptilelib main loop keeps running; run() checks it
        # before waiting for each signal.
        self.reptileLibRun = [True]

        # URL queue, created here by the library.
        self.urlQueue = UrlQueue()

        self.urlist = Urlist()
        # Shared lists: created once here and never rebound afterwards,
        # only mutated in place.
        self.homeUrls = []
        self.pages = []
        self.imagenum = [0]
        print("-" * 50)
        print(".. init self.imagenum %s %s" % (self.imagenum, type(self.imagenum)))
        print("-" * 50)
        self.maxPages = []

        self.reptilectrl = ReptileCtrl(
            homeUrls=self.homeUrls,
            continueRun=self.continueRun,
            urlist=self.urlist,
            urlQueue=self.urlQueue,
            maxPages=self.maxPages,
            pages=self.pages,
            imagenum=self.imagenum,
            outSignalQueue=self.outSignalQueue,
        )
        self.controlserver = ControlServer(self.inSignalQueue, self.outSignalQueue)
        # Start the control server and this thread.
        self.runInit()

    def runInit(self):
        """
        Start the control server, then start this thread.
        """
        self.controlserver.start()
        self.start()

    def run(self):
        """
        Main loop: wait for a signal on ``inSignalQueue`` and dispatch it.

        A signal is a mapping with at least a ``"type"`` key.  Recognised
        types are ``"init"``, ``"resume"``, ``"stop"``, ``"halt"``,
        ``"status"`` and ``"start"``; any other type is ignored.  An
        ``"init"`` signal must also carry ``"homeurls"``, ``"maxpages"``
        and ``"reptilenum"``.

        The loop keeps going while ``self.reptileLibRun[0]`` is true; the
        flag is checked before blocking on the queue for the next signal.
        """
        print("... run while ...")

        while self.reptileLibRun[0]:
            print(".. while ReptileLib running ..")
            signal = self.inSignalQueue.get()
            print("get signal %s" % (signal,))
            _type = signal["type"]
            print("get type %s" % (_type,))

            # Compare by value, not identity: `is` only matches when both
            # sides happen to be the very same string object, which is not
            # the case for unicode or runtime-built strings.
            if _type == "init":
                # Fresh run from an empty project.
                print(".. init from empty project ..")
                self.init(
                    homeUrls=signal["homeurls"],
                    maxPages=signal["maxpages"],
                    threadNum=signal["reptilenum"],
                )

            elif _type == "resume":
                print(".. resume from database ..")
                self.reptilectrl.resume()

            elif _type == "stop":
                print(".. stop ..")
                self.reptilectrl.stop()

            elif _type == "halt":
                print(".. halt ..")
                self.reptilectrl.halt()

            elif _type == "status":
                # Status request; the controller is asked to report it.
                print(".. status ..")
                self.reptilectrl.status()

            elif _type == "start":
                # Create and start the reptile threads, unless they are
                # already flagged as running.
                print(".. run reptile threads ..")
                print("It works!")
                if not self.continueRun[0]:
                    self.continueRun[0] = True
                    self.initThreads()
                    self.threadsRun()

        print("ReptileLib core stopped!")
        print("Reptile stopped")

    def init(self, homeUrls, maxPages, threadNum):
        """
        Full initialisation for a fresh run.

        The shared lists are refilled *in place* so that every component
        holding a reference to them sees the new values; this also makes
        repeated calls safe.

        homeUrls  -- iterable of start URLs, one per site
        maxPages  -- iterable of page limits (presumably one per site;
                     confirm against the callers)
        threadNum -- number of reptile threads ``initThreads`` will create
        """

        def resetList(shared, values):
            # Snapshot first, so passing the shared list itself as the
            # source cannot wipe the data before it is copied.
            snapshot = list(values)
            for item in snapshot:
                print(item)
            # Slice assignment keeps the list object (and thus every
            # outside reference to it) intact.
            shared[:] = snapshot

        print(".. init homeUrls")
        resetList(self.homeUrls, homeUrls)
        resetList(self.maxPages, maxPages)
        self.threadNum = threadNum
        # self.maxPages is deliberately NOT rebound to `maxPages`: doing so
        # would detach it from the list given to ReptileCtrl in __init__.

        print(".. init maxPages: %s" % (self.maxPages,))
        print(".. init pages %s" % (self.pages,))

        # self.pages counts the pages downloaded per site; reset it to one
        # zero per home URL, again in place.
        self.pages[:] = [0] * len(self.homeUrls)

        # Initialise the URL queue from the home URLs.
        self.urlQueue.init(self.homeUrls)
        self.urlQueue.initFrontPage()

        # Persist the home URLs.
        self.htmldb.saveHomeUrls(homeUrls, maxPages, self.pages)

    def initThreads(self):
        """
        Create ``self.threadNum`` Reptile threads in ``self.thlist``
        (they are not started here).
        """
        self.thlist = []
        print("$" * 50)
        print("init thread imagenum %s %s" % (self.imagenum, type(self.imagenum)))
        print("$" * 50)

        # NOTE(review): threadNum is only assigned by init(); a "start"
        # signal that arrives before any "init" raises AttributeError here.
        # Confirm whether the resume path is expected to set it.
        for i in range(self.threadNum):
            # TODO: name each crawler after its site prefix instead of a
            # plain running index.
            th = Reptile(
                name="reptile%d" % i,
                urlQueue=self.urlQueue,
                urlist=self.urlist,
                Flock=self.Flock,
                homeUrls=self.homeUrls,
                maxPageNums=self.maxPages,
                pages=self.pages,
                imagenum=self.imagenum,
                continueRun=self.continueRun,
            )
            self.thlist.append(th)

    def threadsRun(self):
        """
        Start every thread created by ``initThreads``.
        """
        for th in self.thlist:
            th.start()