def mqttConnect():
    global args
    global mqttc
    try:
        if args.mqtt311_protocol:
            ptcol = mosquitto.MQTTv311
        else:
            ptcol = mosquitto.MQTTv31
        mqttc = mosquitto.Client("adsbclient-%d" % (random.randint(0, 65535)), protocol=ptcol)
        mqttc.on_message = mqttOnMessage
        mqttc.on_connect = mqttOnConnect
        mqttc.on_disconnect = mqttOnDisconnect
        mqttc.on_publish = mqttOnPublish
        mqttc.on_subscribe = mqttOnSubscribe

        if args.mqtt_user and args.mqtt_password:
            mqttc.username_pw_set(args.mqtt_user, password=args.mqtt_password)

        mqttc.connect(args.mqtt_host, args.mqtt_port, 60)

        thread = Thread(target=mqttThread)
        thread.setDaemon(True)
        thread.start()
        return True
    except socket.error, e:
        return False

def runWorker(argv):
    global server
    global workdir

    init(argv)

    server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    ##server.setsockopt(socket.AF_INET, socket.SOCK_STREAM)
    server.bind(('0.0.0.0', 2830))
    server.listen(5)

    ##-->monitor_thread = threading.Thread(target=monitor)
    ##-->monitor_thread.setDaemon(True)
    ##-->monitor_thread.start()

    while exitFlag == False:
        (sock, address) = server.accept()
        if exitFlag == True:
            #sock.send("q\n")
            #sock.shutdown(socket.SHUT_RDWR)
            #sock.close()
            server.close()
            break
        thread = Worker(sock)
        thread.setDaemon(1)
        thread.start()

    print 'exitting...'

def Start(self, url): url = url.replace("https", "wss") url = url.replace("http", "ws") url = url + "/socket.io/?EIO=3&transport=websocket" def Run(): # websocket.enableTrace(True) ws = websocket.WebSocketApp(url, on_open=self.on_open, on_message=self.on_message, on_error=self.on_error, on_close=self.on_close) self.ws = ws if config.HttpProxy: data = config.HttpProxy.split(":") if len(data) == 3: port = data[2] host = data[1].replace("//", "") else: host = data port = 80 ws.run_forever(http_proxy_host=host, http_proxy_port=port) else: ws.run_forever() thread = threading.Thread(target=Run) thread.setDaemon(True) thread.start()
def calculateAverage(period, classname):
    now = datetime.datetime.utcnow().replace(tzinfo=utc)
    round_now = now - datetime.timedelta(seconds=now.second, microseconds=now.microsecond)

    for server in Server.objects.all().select_related():
        duration = 0.0  # default so the CronLog entry below is still valid if averaging fails early
        try:
            threads = []
            for probe in server.probes.exclude(graph_type__name__in=['text']):
                thread = threading.Thread(target=calculateAveragesForPeriod,
                                          args=[period, classname, server, probe],
                                          name="SkwisshAverage.%s.%s" % (classname.__name__,
                                                                         probe.display_name.encode('utf-8').replace(" ", "_")))
                thread.setDaemon(False)
                thread.start()
                threads.append(thread)

            for thread in threads:
                thread.join()

            end = datetime.datetime.utcnow().replace(tzinfo=utc)
            total_time = end - now
            duration = float(int((total_time.seconds * 1000000) + total_time.microseconds) / 1000000.0)
            success = True
            message = "Calculated averages values for last %d minutes (server %s)" % (period, server.hostname)
        except:
            success = False
            message = traceback.format_exc()

        CronLog.objects.create(timestamp=round_now, action="average %dmin" % period,
                               server=server, success=success, duration=duration, message=message)

def main():
    logger = logging.getLogger('default')
    logger.setLevel(logging.CRITICAL)

    tracks = getTrackDocuments()
    print 'Found %d tracks with no POIs' % len(tracks)

    workQueue = Queue.Queue()
    resultQueue = Queue.Queue()
    for track in tracks:
        workQueue.put(track)

    threads = []
    for i in range(NUM_THREAD_WORKER):
        thread = SparqlThread(time.time(), len(tracks), workQueue, resultQueue)
        thread.setDaemon(True)
        threads.append(thread)
        thread.start()

    writer = WriterThread(resultQueue, threads)
    writer.setDaemon(True)
    writer.start()

    while len(threads) > 0:
        try:
            # join with a timeout so KeyboardInterrupt can be caught;
            # join() returns None, so finished threads are dropped on the next pass
            threads = [t.join(1000) for t in threads if t is not None and t.isAlive()]
        except KeyboardInterrupt:
            print 'Abort'
            break

def start(self, *args, **kwargs):
    """Start the task.

    This is:
        * not thread-safe
        * assumed to be called in the gtk mainloop
    """
    args = (self.counter,) + args
    thread = threading.Thread(
        target=self._work_callback,
        args=args, kwargs=kwargs
    )
    thread.setDaemon(self.daemon)
    thread.start()

def main():
    if init() == False:
        return

    binlog_handler_manage = BinlogHandlerManage(global_binlog_data_manage)
    binlog_handler1 = BinlogHandler("test_handler")
    binlog_handler_manage.add_last_handler(binlog_handler1)

    thread = HandlerThread(global_signal, global_binlog_data_manage, binlog_handler_manage)
    thread.setDaemon(True)
    thread.start()

    start_sync()

def analyze(self, queryStringList):
    if type(queryStringList) != list:
        queryStringList = [queryStringList]
    try:
        for item in queryStringList:
            self.queue.put(item)
        for i in range(self.numberOfThreads):
            thread = ThreadHelper(self.queue, self.report, self.lock)
            thread.setDaemon(True)
            thread.start()
        self.queue.join()
    except Exception, e:
        logging.error(e)

def go(self): self.timeLog("日志启动于 %s" % self.getStartRunningTime().strftime(self.TimeFormatForLog)) self.timeLog("开始cancel pending orders") self.huobi_cancel_pending_orders() self.timeLog("完成cancel pending orders") thread_pool = [] thread_pool.append(Thread(target=self.trade_thread, args=())) if self.need_rebalance: spot_rebalance = SpotRebalance(self.heart_beat_time, self.coinMarketType, depth_data=self.depth_data, transaction_info=self.order_info_queue) thread_pool.append(Thread(target=spot_rebalance.go, args=())) for thread in thread_pool: thread.setDaemon(True) thread.start() for thread in thread_pool: thread.join()
def createDemo3():
    threads = []
    for i in range(10):
        threads.append(threading.Thread(target=run_thread, name="thread-" + str(i), args=(15,)))

    for thread in threads:
        thread.setDaemon(True)
        thread.start()
        # print "threadName:%s" % thread.getName()

    # the main thread waits for all child threads to finish
    for childThread in threads:
        # threading.Thread.join(childThread)
        childThread.join()

    for i in range(10):
        print "\nhahah%s" % i

def Start(self, url): url = url.replace("https", "wss") url = url.replace("http", "ws") url = url + "/socket.io/?EIO=3&transport=websocket" def Run(): # websocket.enableTrace(True) ws = websocket.WebSocketApp(url, on_open=self.on_open, on_message=self.on_message, on_error=self.on_error, on_close=self.on_close) self.ws = ws ws.run_forever() thread = threading.Thread(target=Run) thread.setDaemon(True) thread.start()
def _start_queue_thread_get_link_file(self):
    while self._queue_get_link:
        # build a one-element args tuple for the worker thread
        args = (self._queue_get_link.popleft(),)
        try:
            while threading.activeCount() >= MAX_THREAD:
                activeThreads = threading.activeCount()
                print activeThreads
                time.sleep(activeThreads / 10)
            thread = threading.Thread(target=self._thread_get_link_file, args=args)
            thread.setDaemon(True)
            thread.start()
            # thread.join()
            time.sleep(0.1)
        except Exception as e:
            print e
    self._thread_get_link = None
    #done

def mqttConnect():
    global args
    global mqttc
    try:
        mqttc = mosquitto.Mosquitto("adsbclient-%d" % (random.randint(0, 65535)))
        mqttc.on_message = mqttOnMessage
        mqttc.on_connect = mqttOnConnect
        mqttc.on_disconnect = mqttOnDisconnect
        mqttc.on_publish = mqttOnPublish
        mqttc.on_subscribe = mqttOnSubscribe

        if args.mqtt_user and args.mqtt_password:
            mqttc.username_pw_set(args.mqtt_user, password=args.mqtt_password)

        mqttc.connect(args.mqtt_host, args.mqtt_port, 60)

        thread = Thread(target=mqttThread)
        thread.setDaemon(True)
        thread.start()
        return True
    except socket.error, e:
        return False

def threaded_get(url=None, urls=None, num_threads=10, cb=None, post=False, depth=False, **kwargs):
    """Download these urls in parallel

    `url[s]' are the webpages to download
    `num_threads' determines the number of threads to download urls with
    `cb' is called after each download with the HTML of the download
        the arguments are the url and downloaded html
        whatever URLs are returned are added to the crawl queue
    `post' is whether to use POST instead of default GET
    `depth' sets to traverse depth first rather than the default breadth first
    """
    cache = kwargs.pop('cache', None)
    if cache:
        common.logger.debug('Making a copy of the cache for each thread')

    class DownloadThread(threading.Thread):
        """Download data
        """
        processing = collections.deque()

        def __init__(self):
            threading.Thread.__init__(self)

        def run(self):
            new_cache = None
            if cache:
                new_cache = copy.copy(cache)
            D = Download(cache=new_cache, **kwargs)
            while urls or DownloadThread.processing:
                # keep track that are processing url
                DownloadThread.processing.append(1)
                try:
                    if depth:
                        url = urls.popleft()
                    else:
                        url = urls.pop()
                except IndexError:
                    # currently no urls to process
                    DownloadThread.processing.popleft()
                    # so check again later
                    time.sleep(SLEEP_TIME)
                else:
                    # download this url
                    try:
                        html = (D.post if post else D.get)(url, **kwargs)
                        if cb:
                            # use callback to process downloaded HTML
                            urls.extend(cb(D, url, html) or [])
                    finally:
                        # have finished processing
                        # make sure this is called even on exception
                        DownloadThread.processing.popleft()

    # put urls into thread safe queue
    urls = urls or []
    if url:
        urls.append(url)
    urls = collections.deque(urls)
    threads = [DownloadThread() for i in range(num_threads)]
    for thread in threads:
        thread.setDaemon(True)  # set daemon so can exit with ctrl-c
        thread.start()

    # wait for threads to finish
    while threading.active_count() > 1:
        time.sleep(SLEEP_TIME)

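# Illustrative only (not from the original source): a minimal, hedged sketch of how the
# threaded_get() helper above might be invoked, assuming the surrounding module already
# imports Download, common and the other names that function uses. The callback name
# scrape_links and the example URL are hypothetical.
def scrape_links(D, url, html):
    # return further URLs to add to the crawl queue; an empty list stops expansion
    return []

threaded_get(url='http://example.com', num_threads=5, cb=scrape_links)
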
def main_loop(self):
    thread = threading.Thread(target=self._actual_main_loop)
    thread.setDaemon(True)  # main program should not be kept from quitting if only this thread is left
    thread.start()

def adsbConnect():
    thread = Thread(target=adsbThread)
    thread.setDaemon(True)
    thread.start()

def start(self):
    thread = Thread(target=self.run)
    # Consumer can choose whether to wait for threads to complete:
    thread.setDaemon(True)
    thread.start()

def threaded_get(url=None, urls=None, url_iter=None, num_threads=10, dl=None, cb=None, depth=True, **kwargs):
    """Download these urls in parallel

    url: the webpage to download
    urls: the webpages to download
    num_threads: the number of threads to download urls with
    cb: Called after each download with the HTML of the download.
        The arguments are the url and downloaded html. Whatever URLs are returned are added to the crawl queue.
    dl: A callback for customizing the download. Takes the download object and url and should return the HTML.
    depth: True for depth first search
    """
    running = True
    lock = threading.Lock()

    def add_iter_urls():
        if lock.acquire(False):
            for url in url_iter or []:
                download_queue.append(url)
                break
            lock.release()

    def process_queue():
        """Thread for downloading webpages
        """
        D = Download(**kwargs)
        while True:
            try:
                url = download_queue.pop() if depth else download_queue.popleft()
            except IndexError:
                add_iter_urls()
                break
            else:
                # download this url
                html = dl(D, url, **kwargs) if dl else D.get(url, **kwargs)
                if cb:
                    try:
                        # use callback to process downloaded HTML
                        result = cb(D, url, html)
                    except StopCrawl:
                        common.logger.info('Stopping crawl signal')
                        running = False
                    except Exception:
                        # catch any callback error to avoid losing thread
                        common.logger.exception('\nIn callback for: ' + str(url))
                    else:
                        # add these URL's to crawl queue
                        for link in result or []:
                            download_queue.append(link)

                # update the crawler state
                # no download or error so must have read from cache
                num_caches = 0 if D.num_downloads or D.num_errors else 1
                state.update(num_downloads=D.num_downloads, num_errors=D.num_errors,
                             num_caches=num_caches, queue_size=len(download_queue))

    download_queue = collections.deque()
    if urls:
        download_queue.extend(urls)
    if url:
        download_queue.append(url)
    common.logger.debug('Start new crawl')

    # initiate the state file with the number of URL's already in the queue
    state = State()
    state.update(queue_size=len(download_queue))

    # wait for all download threads to finish
    threads = []
    while running and (threads or download_queue):
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < num_threads and download_queue:
            # can start more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # set daemon so main thread can exit when receives ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)

    # save the final state after threads finish
    state.save()

def run_async(self, sockopt=None, sslopt=None,
              ping_interval=0, ping_timeout=None,
              http_proxy_host=None, http_proxy_port=None,
              http_no_proxy=None, http_proxy_auth=None,
              skip_utf8_validation=False,
              host=None, origin=None):
    """
    Run the event loop for the WebSocket framework.
    This loop is infinite and stays alive while the websocket is available.

    sockopt: values for socket.setsockopt.
        sockopt must be a tuple and each element is an argument of sock.setsockopt.
    sslopt: ssl socket optional dict.
    ping_interval: automatically send "ping" command every specified period (seconds).
        If set to 0, do not send automatically.
    ping_timeout: timeout (seconds) if the pong message is not received.
    http_proxy_host: http proxy host name.
    http_proxy_port: http proxy port. If not set, set to 80.
    http_no_proxy: host names which don't use the proxy.
    skip_utf8_validation: skip utf8 validation.
    host: update host header.
    origin: update origin header.
    """
    if not ping_timeout or ping_timeout <= 0:
        ping_timeout = None
    if ping_timeout and ping_interval and ping_interval <= ping_timeout:
        raise WebSocketException("Ensure ping_interval > ping_timeout")
    if sockopt is None:
        sockopt = []
    if sslopt is None:
        sslopt = {}
    if self.sock:
        raise WebSocketException("socket is already opened")
    thread = None
    close_frame = None

    try:
        logging.debug("Starting")
        self.sock = websocket.WebSocket(
            self.get_mask_key, sockopt=sockopt, sslopt=sslopt,
            fire_cont_frame=self.on_cont_message and True or False,
            skip_utf8_validation=skip_utf8_validation)
        logging.debug("Created socket")
        self.sock.settimeout(websocket.getdefaulttimeout())
        logging.debug("Connecting")
        self.sock.connect(
            self.url, header=self.header, cookie=self.cookie,
            http_proxy_host=http_proxy_host,
            http_proxy_port=http_proxy_port,
            http_no_proxy=http_no_proxy,
            http_proxy_auth=http_proxy_auth,
            subprotocols=self.subprotocols,
            host=host, origin=origin)
        logging.debug("Calling on open")
        self._callback(self.on_open)

        if ping_interval:
            event = threading.Event()
            thread = threading.Thread(
                target=self._send_ping, args=(ping_interval, event))
            thread.setDaemon(True)
            thread.start()

        logging.debug("Entering mainloop")
        while self.sock.connected:
            logging.debug("Background websocket client calling select")
            r, w, e = select.select(
                (self.sock.sock, self.dispatch_queue.read_pipe), (), (), ping_timeout)
            if not self.keep_running:
                break

            if r and self.dispatch_queue.read_pipe in r:
                logging.debug("Background websocket client running queued jobs")
                n = self.dispatch_queue.run_jobs()

            if r and self.sock.sock in r:
                op_code, frame = self.sock.recv_data_frame(True)
                if op_code == websocket.ABNF.OPCODE_CLOSE:
                    close_frame = frame
                    break
                elif op_code == websocket.ABNF.OPCODE_PING:
                    self._callback(self.on_ping, frame.data)
                elif op_code == websocket.ABNF.OPCODE_PONG:
                    self.last_pong_tm = time.time()
                    self._callback(self.on_pong, frame.data)
                elif op_code == websocket.ABNF.OPCODE_CONT and self.on_cont_message:
                    self._callback(self.on_data, frame.data, frame.opcode, frame.fin)
                    self._callback(self.on_cont_message, frame.data, frame.fin)
                else:
                    data = frame.data
                    if six.PY3 and op_code == websocket.ABNF.OPCODE_TEXT:
                        data = data.decode("utf-8")
                    self._callback(self.on_data, data, frame.opcode, True)
                    self._callback(self.on_message, data)

            if ping_timeout and self.last_ping_tm \
                    and time.time() - self.last_ping_tm > ping_timeout \
                    and self.last_ping_tm - self.last_pong_tm > ping_timeout:
                raise WebSocketTimeoutException("ping/pong timed out")
        logging.debug("While loop exited")
    except (Exception, KeyboardInterrupt, SystemExit) as e:
        traceback.print_exc()
        self._callback(self.on_error, e)
        if isinstance(e, SystemExit):
            # propagate SystemExit further
            raise
    finally:
        logging.debug("Everything has gone to shit")
        if thread and thread.isAlive():
            event.set()
            thread.join()
            self.keep_running = False
        if self.sock is not None:
            self.sock.close()
        close_args = self._get_close_args(
            close_frame.data if close_frame else None)
        self._callback(self.on_close, *close_args)
        self.sock = None

def threaded_get(url=None, urls=None, url_iter=None, num_threads=10, dl=None, cb=None, depth=True, **kwargs):
    """Download these urls in parallel

    url: the webpage to download
    urls: the webpages to download
    num_threads: the number of threads to download urls with
    cb: Called after each download with the HTML of the download.
        The arguments are the url and downloaded html. Whatever URLs are returned are added to the crawl queue.
    dl: A callback for customizing the download. Takes the download object and url and should return the HTML.
    depth: True for depth first search
    """
    running = True
    lock = threading.Lock()

    def add_iter_urls():
        if lock.acquire(False):
            for url in url_iter or []:
                download_queue.append(url)
                break
            lock.release()

    def process_queue():
        """Thread for downloading webpages
        """
        D = Download(**kwargs)
        while True:
            try:
                url = download_queue.pop() if depth else download_queue.popleft()
            except IndexError:
                add_iter_urls()
                break
            else:
                # download this url
                html = dl(D, url, **kwargs) if dl else D.get(url, **kwargs)
                if cb:
                    try:
                        # use callback to process downloaded HTML
                        result = cb(D, url, html)
                    except StopCrawl:
                        common.logger.info('Stopping crawl signal')
                        running = False
                    except Exception:
                        # catch any callback error to avoid losing thread
                        common.logger.exception('\nIn callback for: ' + str(url))
                    else:
                        # add these URL's to crawl queue
                        for link in result or []:
                            download_queue.append(urlparse.urljoin(url, link))

                # update the crawler state
                # no download or error so must have read from cache
                num_caches = 0 if D.num_downloads or D.num_errors else 1
                state.update(num_downloads=D.num_downloads, num_errors=D.num_errors,
                             num_caches=num_caches, queue_size=len(download_queue))

    download_queue = collections.deque()
    if urls:
        download_queue.extend(urls)
    if url:
        download_queue.append(url)
    common.logger.debug('Start new crawl')

    # initiate the state file with the number of URL's already in the queue
    state = State()
    state.update(queue_size=len(download_queue))

    # wait for all download threads to finish
    threads = []
    while running and (threads or download_queue):
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < num_threads and download_queue:
            # can start more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # set daemon so main thread can exit when receives ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)

    # save the final state after threads finish
    state.save()

def setDaemon(thread):
    # Reference: http://stackoverflow.com/questions/190010/daemon-threads-explanation
    if PYVERSION >= "2.6":
        thread.daemon = True
    else:
        thread.setDaemon(True)

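# Illustrative only (not from the original source): a minimal sketch showing how the
# version-agnostic setDaemon() helper above replaces a direct thread.setDaemon(True) call.
# The worker() target is hypothetical; PYVERSION is assumed to be defined in the same
# module as the helper.
def worker():
    pass

t = threading.Thread(target=worker)
setDaemon(t)  # mark as daemon on both old and new Python versions
t.start()
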
def threaded_get(url=None, urls=None, num_threads=10, dl=None, cb=None, depth=None,
                 wait_finish=True, reuse_queue=False, max_queue=1000, **kwargs):
    """Download these urls in parallel

    url: the webpage to download
    urls: the webpages to download
    num_threads: the number of threads to download urls with
    cb: Called after each download with the HTML of the download.
        The arguments are the url and downloaded html. Whatever URLs are returned are added to the crawl queue.
    dl: A callback for customizing the download. Takes the download object and url and should return the HTML.
    depth: Deprecated - will be removed in later version
    wait_finish: whether to wait until all download threads have finished before returning
    reuse_queue: Whether to continue the queue from the previous run.
    max_queue: The maximum number of queued URLs to keep in memory. The rest will be in the cache.
    """
    if kwargs.pop('cache', None):
        common.logger.debug('threaded_get does not support cache flag')
    lock = threading.Lock()

    class DownloadThread(threading.Thread):
        """Thread for downloading webpages
        """
        processing = collections.deque()  # to track whether are still downloading
        discovered = {}  # the URL's that have been discovered

        def __init__(self):
            threading.Thread.__init__(self)

        def run(self):
            D = Download(**kwargs)
            queue = pdict.Queue(settings.queue_file)
            while seed_urls or DownloadThread.processing:
                # keep track that are processing url
                DownloadThread.processing.append(1)
                try:
                    url = seed_urls.pop()
                except IndexError:
                    # currently no urls to process
                    DownloadThread.processing.popleft()
                    # so check again later
                    time.sleep(SLEEP_TIME)
                else:
                    try:
                        # download this url
                        html = dl(D, url, **kwargs) if dl else D.get(url, **kwargs)
                        if cb:
                            try:
                                # use callback to process downloaded HTML
                                result = cb(D, url, html)
                            except Exception, e:
                                # catch any callback error to avoid losing thread
                                common.logger.exception('\nIn callback for: ' + str(url))
                            else:
                                # add these URL's to crawl queue
                                for link in result or []:
                                    cb_url = urlparse.urljoin(url, link)
                                    if isinstance(result, dict):
                                        DownloadThread.discovered[cb_url] = result[link]
                                    else:
                                        DownloadThread.discovered[cb_url] = DEFAULT_PRIORITY

                                if len(seed_urls) < max_queue:
                                    # need to request more queue
                                    if DownloadThread.discovered or len(queue) > 0:
                                        # there are outstanding in the queue
                                        if lock.acquire(False):
                                            # no other thread is downloading
                                            common.logger.debug('Loading from queue: %d' % len(seed_urls))
                                            discovered = []
                                            while DownloadThread.discovered:
                                                discovered.append(DownloadThread.discovered.popitem())
                                            queue.push(discovered)
                                            # get next batch of URLs from cache
                                            seed_urls.extend(queue.pull(limit=max_queue))
                                            lock.release()
                    finally:
                        # have finished processing
                        # make sure this is called even on exception to avoid eternal loop
                        DownloadThread.processing.pop()

                    # update the crawler state
                    # no download or error so must have read from cache
                    num_caches = 0 if D.num_downloads or D.num_errors else 1
                    state.update(num_downloads=D.num_downloads, num_errors=D.num_errors,
                                 num_caches=num_caches, queue_size=len(queue))

    queue = pdict.Queue(settings.queue_file)
    if reuse_queue:
        # command line flag to enable queue
        queued_urls = queue.pull(limit=max_queue)
    else:
        queued_urls = []
    if queued_urls:
        # continue the previous crawl
        seed_urls = collections.deque(queued_urls)
        common.logger.debug('Loading crawl queue')
    else:
        # remove any queued URL's so can crawl again
        queue.clear()
        urls = urls or []
        if url:
            urls.append(url)
        queue.push([(url, DEFAULT_PRIORITY) for url in urls])
        # put urls into thread safe queue
        seed_urls = collections.deque(queue.pull(limit=max_queue))
        common.logger.debug('Start new crawl')

    # initiate the state file with the number of URL's already in the queue
    state = State()
    state.update(queue_size=len(queue))

    # start the download threads
    threads = [DownloadThread() for i in range(num_threads)]
    for thread in threads:
        thread.setDaemon(True)  # set daemon so main thread can exit when receives ctrl-c
        thread.start()

    # Wait for all download threads to finish
    while threads and wait_finish:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        time.sleep(SLEEP_TIME)

    # save the final state after threads finish
    state.save()

    total.append(line)
    threadLimiter.release()

# Copy inaccessible hosts into this
down_hosts = []
try:
    with open(argv[2]) as down:
        down_hosts_raw = down.readlines()
    down_hosts = [line[2:] for line in down_hosts_raw]
    for l in down_hosts:
        line = l[:-1]
        threadLimiter.acquire()
        signal.signal(signal.SIGINT, signal_handler)
        thread = threading.Thread(target=run, args=(line, ))
        thread.setDaemon(True)
        thread.start()
except ExitCommand:
    pass
finally:
    with open('failed.txt', 'w') as failed_hosts_file:
        for host in failed:
            failed_hosts_file.write("- {}\n".format(host))
    with open('success.txt', 'w') as successful_hosts_file:
        for host in success:
            successful_hosts_file.write("- {}\n".format(host))
    with open('processed_hosts.txt', 'w') as processed_hosts_file:
        for host in processed:
            processed_hosts_file.write("- {}\n".format(host))
    logging.warning("The following hosts failed to reauthenticate! {}".format(failed,))

def threaded_get(url=None, urls=None, num_threads=10, dl=None, cb=None, depth=None,
                 wait_finish=True, reuse_queue=False, max_queue=1000, **kwargs):
    """Download these urls in parallel

    url: the webpage to download
    urls: the webpages to download
    num_threads: the number of threads to download urls with
    cb: Called after each download with the HTML of the download.
        The arguments are the url and downloaded html. Whatever URLs are returned are added to the crawl queue.
    dl: A callback for customizing the download. Takes the download object and url and should return the HTML.
    depth: Deprecated - will be removed in later version
    wait_finish: whether to wait until all download threads have finished before returning
    reuse_queue: Whether to continue the queue from the previous run.
    max_queue: The maximum number of queued URLs to keep in memory. The rest will be in the cache.
    """
    if kwargs.pop('cache', None):
        common.logger.debug('threaded_get does not support cache flag')
    lock = threading.Lock()

    class DownloadThread(threading.Thread):
        """Thread for downloading webpages
        """
        processing = collections.deque()  # to track whether are still downloading
        discovered = {}  # the URL's that have been discovered

        def __init__(self):
            threading.Thread.__init__(self)

        def run(self):
            D = Download(**kwargs)
            queue = pdict.Queue(settings.queue_file)
            while seed_urls or DownloadThread.processing:
                # keep track that are processing url
                DownloadThread.processing.append(1)
                try:
                    url = seed_urls.pop()
                except IndexError:
                    # currently no urls to process
                    DownloadThread.processing.popleft()
                    # so check again later
                    time.sleep(SLEEP_TIME)
                else:
                    try:
                        # download this url
                        html = dl(D, url, **kwargs) if dl else D.get(url, **kwargs)
                        if cb:
                            try:
                                # use callback to process downloaded HTML
                                cb_urls = cb(D, url, html)
                            except Exception, e:
                                # catch any callback error to avoid losing thread
                                common.logger.error('in callback for: ' + str(url) + '\n' + traceback.format_exc())
                            else:
                                # add these URL's to crawl queue
                                for cb_url in cb_urls or []:
                                    if isinstance(cb_urls, dict):
                                        DownloadThread.discovered[cb_url] = cb_urls[cb_url]
                                    else:
                                        DownloadThread.discovered[cb_url] = DEFAULT_PRIORITY

                                if len(seed_urls) < max_queue:
                                    # need to request more queue
                                    if DownloadThread.discovered or len(queue) > 0:
                                        # there are outstanding in the queue
                                        if lock.acquire(False):
                                            # no other thread is downloading
                                            common.logger.debug('Loading from queue: %d' % len(seed_urls))
                                            discovered = []
                                            while DownloadThread.discovered:
                                                discovered.append(DownloadThread.discovered.popitem())
                                            queue.push(discovered)
                                            # get next batch of URLs from cache
                                            seed_urls.extend(queue.pull(limit=max_queue))
                                            lock.release()
                                """
                                for cb_url in cb_urls or []:
                                    if cb_url not in DownloadThread.discovered:
                                        DownloadThread.discovered[cb_url] = 1
                                        seed_urls.append(cb_url)
                                """
                    finally:
                        # have finished processing
                        # make sure this is called even on exception to avoid eternal loop
                        DownloadThread.processing.pop()

                    # update the crawler state
                    # no download or error so must have read from cache
                    num_caches = 0 if D.num_downloads or D.num_errors else 1
                    state.update(num_downloads=D.num_downloads, num_errors=D.num_errors,
                                 num_caches=num_caches, queue_size=len(queue))

    queue = pdict.Queue(settings.queue_file)
    if reuse_queue:
        # command line flag to enable queue
        queued_urls = queue.pull(limit=max_queue)
    else:
        queued_urls = []
    if queued_urls:
        # continue the previous crawl
        seed_urls = collections.deque(queued_urls)
        common.logger.debug('Loading crawl queue')
    else:
        # remove any queued URL's so can crawl again
        queue.clear()
        urls = urls or []
        if url:
            urls.append(url)
        queue.push([(url, DEFAULT_PRIORITY) for url in urls])
        # put urls into thread safe queue
        seed_urls = collections.deque(queue.pull(limit=max_queue))
        common.logger.debug('Start new crawl')

    # initiate the state file with the number of URL's already in the queue
    state = State()
    state.update(queue_size=len(queue))

    # start the download threads
    threads = [DownloadThread() for i in range(num_threads)]
    for thread in threads:
        thread.setDaemon(True)  # set daemon so main thread can exit when receives ctrl-c
        thread.start()

    # Wait for all download threads to finish
    while threads and wait_finish:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        time.sleep(SLEEP_TIME)

    # save the final state after threads finish
    state.save()