def set_state(self, state):
    """ Set state to a previously saved state """

    # Get base url object
    self._baseUrlObj = state.get('_baseUrlObj')
    # If the base url object is None, we cannot proceed,
    # so return -1
    if self._baseUrlObj is None:
        return -1

    # Set state for simple data members
    self._pushes = state.get('_pushes', 0)
    self._lockedinst = state.get('_lockedinst', 0)
    self._lasttimestamp = state.get('_lasttimestamp', time.time())
    self._requests = state.get('_requests', 0)
    self._lastblockedtime = state.get('_lastblockedtime', 0)
    self.buffer = state.get('buffer', [])

    # Set state for queues
    self.url_q.queue = state.get('url_q', MyDeque())
    self.data_q.queue = state.get('data_q', MyDeque())

    # If both queues are empty, we don't have anything to do
    if len(self.url_q.queue) == 0 and len(self.data_q.queue) == 0:
        moreinfo('Size of data/url queues are zero, nothing to re-run')
        return -1

    cfg = GetObject('config')
    self._configobj = cfg

    if cfg.fastmode:
        # Create threads and set their state
        for idx, tdict in state.get('threadinfo').items():
            role = tdict.get('role')
            t = None

            if role == 'fetcher':
                t = crawler.HarvestManUrlFetcher(idx, None)
                self._numfetchers += 1
            elif role == 'crawler':
                t = crawler.HarvestManUrlCrawler(idx, None)
                t.links = tdict.get('links')
                self._numcrawlers += 1

            if t:
                t._status = tdict.get('_status')
                t._loops = tdict.get('_loops')
                t._url = tdict.get('_url')
                t._urlobject = tdict.get('_urlobject')
                t.buffer = tdict.get('buffer')
                if t._urlobject:
                    t._resuming = True

                self.add_tracker(t)
                t.setDaemon(True)

        # Set base tracker
        self._basetracker = self._trackers[0]
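# Illustrative sketch, not part of the original class: one way a saved
# project state could be loaded and fed back into set_state().  The state
# file path, the pickle format and the 'trackerqueue' registry key are
# assumptions made for this example only; the real resume logic lives
# elsewhere in HarvestMan.
def _example_restore_state(statefile):
    """ Load a pickled state dictionary and resume the tracker queue """

    import pickle

    f = open(statefile, 'rb')
    try:
        # Expected to be a dict with keys such as '_baseUrlObj', 'url_q',
        # 'data_q' and 'threadinfo', as read by set_state() above.
        state = pickle.load(f)
    finally:
        f.close()

    tq = GetObject('trackerqueue')   # assumed registry lookup
    # set_state() returns -1 if there is nothing to resume
    return tq.set_state(state)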
def dead_thread_callback(self, t):
    """ Callback invoked by a tracker thread when it dies with an
    exception. A fresh thread is created and the dead thread's data
    is migrated to it """

    try:
        self._cond.acquire()

        # First find out the role of the dead thread
        role = t.get_role()
        new_t = None

        if role == 'fetcher':
            new_t = crawler.HarvestManUrlFetcher(t.get_index(), None)
        elif role == 'crawler':
            new_t = crawler.HarvestManUrlCrawler(t.get_index(), None)

        # Migrate data and start the new thread
        if new_t:
            new_t._url = t._url
            new_t._urlobject = t._urlobject
            new_t.buffer = copy.deepcopy(t.buffer)

            # If this is a crawler, copy its links as well
            if role == 'crawler':
                new_t.links = t.links[:]

            # Replace the dead thread in the tracker list
            idx = self._trackers.index(t)
            self._trackers[idx] = new_t

            new_t._resuming = True
            new_t.start()
            time.sleep(2.0)

            return 0
        else:
            # Could not make a new thread, so remove the dead thread
            # from the tracker list and decrement the thread count.
            self._trackers.remove(t)
            if role == 'fetcher':
                self._numfetchers -= 1
            elif role == 'crawler':
                self._numcrawlers -= 1

            return -1
    finally:
        self._cond.release()
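# Illustrative sketch, not part of the original modules: roughly how a
# tracker thread's run loop could hand itself to dead_thread_callback()
# when an unhandled exception escapes, so the queue can replace it.  The
# class name, its attributes and the work() hook are assumptions for this
# example only; the real fetcher/crawler classes live in crawler.py.

import threading

class _ExampleTracker(threading.Thread):

    def __init__(self, index, queue, role='fetcher'):
        threading.Thread.__init__(self)
        self._index = index
        self._queue = queue          # the tracker queue owning this thread
        self._role = role
        self._url = None
        self._urlobject = None
        self.buffer = []
        self.links = []

    def get_role(self):
        return self._role

    def get_index(self):
        return self._index

    def work(self):
        # Placeholder for the per-thread crawl/fetch loop.
        raise NotImplementedError

    def run(self):
        try:
            self.work()
        except Exception:
            # Let the owning queue migrate our state to a fresh thread
            # (or drop us from the tracker list if that fails).
            self._queue.dead_thread_callback(self)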
def crawl(self):
    """ Starts crawling for this project """

    # Reset flag
    self._flag = 0

    # Clear the event flag
    # self.exitobj.clear()

    if os.name == 'nt':
        t1 = time.clock()
    else:
        t1 = time.time()

    # Set start time on config object
    self._configobj.starttime = t1

    if not self._configobj.urlserver:
        self.push(self._baseUrlObj, 'crawler')
    else:
        try:
            # Flush the url server of any previous urls by
            # sending a flush command, then send the base url index.
            send_url("flush", self._configobj.urlhost, self._configobj.urlport)
            send_url(str(self._baseUrlObj.index),
                     self._configobj.urlhost,
                     self._configobj.urlport)
        except:
            pass

    if self._configobj.fastmode:
        # Start the harvestman controller thread
        import datamgr

        self._controller = datamgr.harvestManController()
        self._controller.start()

        # Pre-launch the number of tracker threads specified
        # in the config file.

        # Start the base tracker first
        self._basetracker.setDaemon(True)
        self._basetracker.start()

        while self._basetracker.get_status() != 0:
            time.sleep(0.1)

        for x in range(1, self._configobj.maxtrackers):
            # Back to equality among threads
            if x % 2 == 0:
                t = crawler.HarvestManUrlFetcher(x, None)
            else:
                t = crawler.HarvestManUrlCrawler(x, None)

            self.add_tracker(t)
            t.setDaemon(True)
            t.start()

        for t in self._trackers:
            if t.get_role() == 'fetcher':
                self._numfetchers += 1
            elif t.get_role() == 'crawler':
                self._numcrawlers += 1

        # bug: give the threads some time to start,
        # otherwise we sometimes exit immediately.
        time.sleep(2.0)

        self.mainloop()

        # Set flag to 1 to denote that downloading is finished.
        self._flag = 1

        self.stop_threads(noexit=True)
    else:
        self._basetracker.action()
def crawl(self):
    """ Starts crawling for this project """

    # Reset flag
    self._flag = 0

    t1 = time.time()

    # Set start time on config object
    self._configobj.starttime = t1

    self.push(self._baseUrlObj, 'crawler')

    if self._configobj.fastmode:
        # Start harvestman controller thread
        import datamgr

        self._controller = datamgr.HarvestManController()
        self._controller.start()

        # Pre-launch the number of threads specified
        # in the config file.

        # Initialize thread dictionary
        self._basetracker.setDaemon(True)
        self._basetracker.start()

        while self._basetracker.get_status() != 0:
            time.sleep(0.1)

        for x in range(1, self._configobj.maxtrackers):
            # Back to equality among threads
            if x % 2 == 0:
                t = crawler.HarvestManUrlFetcher(x, None)
            else:
                t = crawler.HarvestManUrlCrawler(x, None)

            self.add_tracker(t)
            t.setDaemon(True)
            t.start()

        for t in self._trackers:
            if t.get_role() == 'fetcher':
                self._numfetchers += 1
            elif t.get_role() == 'crawler':
                self._numcrawlers += 1

        # bug: give the threads some time to start,
        # otherwise we exit immediately sometimes.
        time.sleep(2.0)

        self.mainloop()

        # Set flag to 1 to denote that downloading is finished.
        self._flag = 1

        self.stop_threads(noexit=True)
    else:
        self._basetracker.action()
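# Illustrative sketch, not part of the original module: the minimal driver
# logic implied by crawl() above.  The 'trackerqueue' registry key is an
# assumption for this example; only GetObject('config'), the fastmode flag
# and crawl() itself appear in the code above, and the base URL object is
# assumed to have been configured and pushed already.
def _example_start_crawl():
    """ Kick off a crawl using an already-configured tracker queue """

    cfg = GetObject('config')
    cfg.fastmode = 1                 # take the multi-threaded branch in crawl()

    tq = GetObject('trackerqueue')   # assumed registry lookup
    tq.crawl()                       # blocks in mainloop() until downloads finish
    # After crawl() returns, tq._flag is 1 and the worker threads have been
    # stopped via stop_threads(noexit=True).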