Example #1
def mqttConnect():
    global args
    global mqttc
    try:
        if args.mqtt311_protocol:
            ptcol = mosquitto.MQTTv311
        else:
            ptcol = mosquitto.MQTTv31
        mqttc = mosquitto.Client("adsbclient-%d" % (random.randint(0, 65535)),
                                 protocol=ptcol)
        mqttc.on_message = mqttOnMessage
        mqttc.on_connect = mqttOnConnect
        mqttc.on_disconnect = mqttOnDisconnect
        mqttc.on_publish = mqttOnPublish
        mqttc.on_subscribe = mqttOnSubscribe

        if args.mqtt_user and args.mqtt_password:
            mqttc.username_pw_set(args.mqtt_user, password=args.mqtt_password)

        mqttc.connect(args.mqtt_host, args.mqtt_port, 60)

        thread = Thread(target=mqttThread)
        thread.setDaemon(True)
        thread.start()
        return True
    except socket.error, e:
        return False
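The mqttThread worker passed as target is referenced but not defined in this snippet. A minimal sketch of what it might look like, assuming the mosquitto/paho client API used above:

def mqttThread():
    # hypothetical worker: drive the MQTT client's network loop until the process exits
    mqttc.loop_forever()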
Example #2
def runWorker(argv):
    global server
    global workdir

    init(argv)

    server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    ##server.setsockopt(socket.AF_INET, socket.SOCK_STREAM)
    server.bind(('0.0.0.0', 2830))

    server.listen(5)

    ##-->monitor_thread = threading.Thread(target=monitor)
    ##-->monitor_thread.setDaemon(True)
    ##-->monitor_thread.start()

    while exitFlag == False:
        (sock, address) = server.accept()
        if exitFlag == True:
            #sock.send("q\n")
            #sock.shutdown(socket.SHUT_RDWR)
            #sock.close()
            server.close()
            break

        thread = Worker(sock)
        thread.setDaemon(1)
        thread.start()

    print 'exiting...'
Example #3
    def Start(self, url):
        url = url.replace("https", "wss")
        url = url.replace("http", "ws")
        url = url + "/socket.io/?EIO=3&transport=websocket"

        def Run():
            # websocket.enableTrace(True)
            ws = websocket.WebSocketApp(url,
                                        on_open=self.on_open,
                                        on_message=self.on_message,
                                        on_error=self.on_error,
                                        on_close=self.on_close)
            self.ws = ws
            if config.HttpProxy:
                data = config.HttpProxy.split(":")
                if len(data) == 3:
                    port = data[2]
                    host = data[1].replace("//", "")
                else:
                    # fall back when the proxy value is not in "scheme://host:port" form
                    host = data[0]
                    port = 80

                ws.run_forever(http_proxy_host=host, http_proxy_port=port)
            else:
                ws.run_forever()

        thread = threading.Thread(target=Run)
        thread.setDaemon(True)
        thread.start()
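For the proxy branch above to take effect, config.HttpProxy is expected to look like "scheme://host:port", which splits into three parts on ':'. A hedged usage sketch, with a hypothetical client object and an illustrative proxy address:

client = SocketIOClient()                    # hypothetical wrapper exposing Start() as above
config.HttpProxy = "http://127.0.0.1:8080"   # illustrative proxy address
client.Start("https://example.com")          # connects to wss://example.com/socket.io/... via the proxy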
Example #4
def calculateAverage(period, classname):
    now = datetime.datetime.utcnow().replace(tzinfo=utc)
    round_now = now - datetime.timedelta(seconds=now.second, microseconds=now.microsecond)
    for server in Server.objects.all().select_related():
        try:
            threads = []
            for probe in server.probes.exclude(graph_type__name__in=['text']):
                thread = threading.Thread(target=calculateAveragesForPeriod, args=[period, classname, server, probe], name="SkwisshAverage.%s.%s" % (classname.__name__, probe.display_name.encode('utf-8').replace(" ", "_")))
                thread.setDaemon(False)
                thread.start()
                threads.append(thread)

            for thread in threads:
                thread.join()

            end = datetime.datetime.utcnow().replace(tzinfo=utc)
            total_time = end - now
            duration = float(int((total_time.seconds * 1000000) + total_time.microseconds) / 1000000.0)
            success = True
            message = "Calculated averages values for last %d minutes (server %s)" % (period, server.hostname)
        except:
            success = False
            duration = 0.0  # no timing available when the calculation failed
            message = traceback.format_exc()

        CronLog.objects.create(timestamp=round_now, action="average %dmin" % period, server=server, success=success, duration=duration, message=message)
Example #5
def main():

        logger = logging.getLogger('default')
        logger.setLevel(logging.CRITICAL)

        tracks = getTrackDocuments()
        print 'Found %d tracks with no POIs' % len(tracks)

        workQueue = Queue.Queue()
        resultQueue = Queue.Queue()

        for track in tracks:
                workQueue.put(track)

        threads = []
        for i in range(NUM_THREAD_WORKER):
                thread = SparqlThread(time.time(), len(tracks), workQueue, resultQueue)
                thread.setDaemon(True)
                threads.append(thread)
                thread.start()

        writer = WriterThread(resultQueue, threads)
        writer.setDaemon(True)
        writer.start()

        while len(threads) > 0:
                try:
                        # join with a timeout so a KeyboardInterrupt can still be caught;
                        # Thread.join() returns None, so rebuild the list from threads still alive
                        for t in threads:
                                t.join(1000)
                        threads = [t for t in threads if t.is_alive()]

                except KeyboardInterrupt:
                        print 'Abort'
                        break
Example #6
    def start(self, *args, **kwargs):
        """Start the task.

        This is:
            * not threadsafe
            * assumed to be called in the gtk mainloop
        """
        args = (self.counter,) + args
        thread = threading.Thread(
                target=self._work_callback,
                args=args, kwargs=kwargs
                )
        thread.setDaemon(self.daemon)
        thread.start()
Example #7
def main():
    if init() == False:
        return

    binlog_handler_manage = BinlogHandlerManage(global_binlog_data_manage)
    binlog_handler1 = BinlogHandler("test_handler")
    binlog_handler_manage.add_last_handler(binlog_handler1)
    thread = HandlerThread(global_signal, global_binlog_data_manage,
                           binlog_handler_manage)

    thread.setDaemon(True)
    thread.start()

    start_sync()
Example #8
	def analyze(self, queryStringList):
		if type(queryStringList) != list:
			queryStringList = [queryStringList]
		try:
			for item in queryStringList:
				self.queue.put(item)
			
			for i in range(self.numberOfThreads):
				thread = ThreadHelper(self.queue, self.report, self.lock)
				thread.setDaemon(True)
				thread.start()

			self.queue.join()
		except Exception, e:
			logging.error(e)
Example #9
    def go(self):
        self.timeLog("Log started at %s" % self.getStartRunningTime().strftime(self.TimeFormatForLog))
        self.timeLog("Start cancelling pending orders")
        self.huobi_cancel_pending_orders()
        self.timeLog("Finished cancelling pending orders")

        thread_pool = []
        thread_pool.append(Thread(target=self.trade_thread, args=()))
        if self.need_rebalance:
            spot_rebalance = SpotRebalance(self.heart_beat_time, self.coinMarketType, depth_data=self.depth_data,
                                           transaction_info=self.order_info_queue)
            thread_pool.append(Thread(target=spot_rebalance.go, args=()))
        for thread in thread_pool:
            thread.setDaemon(True)
            thread.start()
        for thread in thread_pool:
            thread.join()
Example #10
def createDemo3():
    threads = []
    for i in range(10):
        threads.append(threading.Thread(target=run_thread, name="thread-" + str(i), args=(15,)))

    for thread in threads:
        thread.setDaemon(True)
        thread.start()
        # print "threadName:%s" % thread.getName()

    # the main thread waits for all child threads to finish
    for childThread in threads:
        # threading.Thread.join(childThread)
        childThread.join()

    for i in range(10):
        print "\nhahah%s" % i
Example #11
    def Start(self, url):
        url = url.replace("https", "wss")
        url = url.replace("http", "ws")
        url = url + "/socket.io/?EIO=3&transport=websocket"

        def Run():
            # websocket.enableTrace(True)
            ws = websocket.WebSocketApp(url,
                                        on_open=self.on_open,
                                        on_message=self.on_message,
                                        on_error=self.on_error,
                                        on_close=self.on_close)
            self.ws = ws
            ws.run_forever()

        thread = threading.Thread(target=Run)
        thread.setDaemon(True)
        thread.start()
Example #12
    def _start_queue_thread_get_link_file(self):

        while self._queue_get_link:
            args = (self._queue_get_link.popleft(),)  # build a one-element args tuple
            try:
                while threading.activeCount() >= MAX_THREAD:
                    activeThreads = threading.activeCount()
                    print activeThreads
                    time.sleep(activeThreads / 10)

                thread = threading.Thread(target=self._thread_get_link_file,
                                          args=args)
                thread.setDaemon(True)
                thread.start()
                # thread.join()
                time.sleep(0.1)
            except Exception as e:
                print e

        self._thread_get_link = None  #done
Example #13
def mqttConnect():
  global args
  global mqttc
  try:
    mqttc = mosquitto.Mosquitto("adsbclient-%d" % (random.randint(0, 65535)))
    mqttc.on_message = mqttOnMessage
    mqttc.on_connect = mqttOnConnect
    mqttc.on_disconnect = mqttOnDisconnect
    mqttc.on_publish = mqttOnPublish
    mqttc.on_subscribe = mqttOnSubscribe

    if args.mqtt_user and args.mqtt_password:
      mqttc.username_pw_set(args.mqtt_user, password = args.mqtt_password)

    mqttc.connect(args.mqtt_host, args.mqtt_port, 60)

    thread = Thread(target = mqttThread)
    thread.setDaemon(True)
    thread.start()
    return True
  except socket.error, e:
    return False
Example #14
def threaded_get(url=None, urls=None, num_threads=10, cb=None, post=False, depth=False, **kwargs):
    """Download these urls in parallel

    `url[s]' are the webpages to download
    `num_threads' determines the number of threads to download urls with
    `cb' is called after each download with the HTML of the download   
        the arguments are the url and downloaded html
        whatever URLs are returned are added to the crawl queue
    `post' is whether to use POST instead of default GET
    `depth' sets to traverse depth first rather than the default breadth first
    """
    cache = kwargs.pop('cache', None)
    if cache:
        common.logger.debug('Making a copy of the cache for each thread')
        
    class DownloadThread(threading.Thread):
        """Download data
        """
        processing = collections.deque()

        def __init__(self):
            threading.Thread.__init__(self)

        def run(self):
            new_cache = None
            if cache:
                new_cache = copy.copy(cache)
            D = Download(cache=new_cache, **kwargs)
            while urls or DownloadThread.processing:
                # keep track that we are processing a url
                DownloadThread.processing.append(1) 
                try:
                    if depth:
                        url = urls.popleft()
                    else:
                        url = urls.pop()
                except IndexError:
                    # currently no urls to process
                    DownloadThread.processing.popleft()
                    # so check again later
                    time.sleep(SLEEP_TIME)
                else:
                    # download this url
                    try:
                        html = (D.post if post else D.get)(url, **kwargs)
                        if cb:
                            # use callback to process downloaded HTML
                            urls.extend(cb(D, url, html) or [])
                    finally:
                        # have finished processing
                        # make sure this is called even on exception
                        DownloadThread.processing.popleft()

    # put urls into thread safe queue
    urls = urls or []
    if url: urls.append(url)
    urls = collections.deque(urls)
    threads = [DownloadThread() for i in range(num_threads)]
    for thread in threads:
        thread.setDaemon(True) # set daemon so the program can exit with ctrl-c
        thread.start()
    # wait for threads to finish
    while threading.active_count() > 1:
        time.sleep(SLEEP_TIME)
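A minimal usage sketch for threaded_get above, assuming a callback that extracts further links from each downloaded page (the callback name and the link extraction are illustrative only):

import re

def collect_links(D, url, html):
    # hypothetical callback: any URLs returned here are added to the crawl queue
    return re.findall(r'href="(http[^"]+)"', html)

threaded_get(url='http://example.com', num_threads=5, cb=collect_links)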
Example #15
def threaded_get(url=None,
                 urls=None,
                 num_threads=10,
                 cb=None,
                 post=False,
                 depth=False,
                 **kwargs):
    """Download these urls in parallel

    `url[s]' are the webpages to download
    `num_threads' determines the number of threads to download urls with
    `cb' is called after each download with the HTML of the download   
        the arguments are the url and downloaded html
        whatever URLs are returned are added to the crawl queue
    `post' is whether to use POST instead of default GET
    `depth' sets to traverse depth first rather than the default breadth first
    """
    cache = kwargs.pop('cache', None)
    if cache:
        common.logger.debug('Making a copy of the cache for each thread')

    class DownloadThread(threading.Thread):
        """Download data
        """
        processing = collections.deque()

        def __init__(self):
            threading.Thread.__init__(self)

        def run(self):
            new_cache = None
            if cache:
                new_cache = copy.copy(cache)
            D = Download(cache=new_cache, **kwargs)
            while urls or DownloadThread.processing:
                # keep track that we are processing a url
                DownloadThread.processing.append(1)
                try:
                    if depth:
                        url = urls.popleft()
                    else:
                        url = urls.pop()
                except IndexError:
                    # currently no urls to process
                    DownloadThread.processing.popleft()
                    # so check again later
                    time.sleep(SLEEP_TIME)
                else:
                    # download this url
                    try:
                        html = (D.post if post else D.get)(url, **kwargs)
                        if cb:
                            # use callback to process downloaded HTML
                            urls.extend(cb(D, url, html) or [])
                    finally:
                        # have finished processing
                        # make sure this is called even on exception
                        DownloadThread.processing.popleft()

    # put urls into thread safe queue
    urls = urls or []
    if url: urls.append(url)
    urls = collections.deque(urls)
    threads = [DownloadThread() for i in range(num_threads)]
    for thread in threads:
        thread.setDaemon(True)  # set daemon so the program can exit with ctrl-c
        thread.start()
    # wait for threads to finish
    while threading.active_count() > 1:
        time.sleep(SLEEP_TIME)
Example #16
 def main_loop(self):
     thread = threading.Thread(target=self._actual_main_loop)
     thread.setDaemon(True)  # daemon: the main program should not be kept from quitting if only this thread is left
     thread.start()
Example #17
 def main_loop(self):
     thread = threading.Thread(target=self._actual_main_loop)
     thread.setDaemon(True)  # daemon: the main program should not be kept from quitting if only this thread is left
     thread.start()
Example #18
def adsbConnect():
    thread = Thread(target=adsbThread)
    thread.setDaemon(True)
    thread.start()
Example #19
 def start(self):
     thread = Thread(target=self.run)
     # Consumer can choose whether to wait for threads to complete:
     thread.setDaemon(True)
     thread.start()
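The comment above points at the alternative: if the consumer wants to wait for the work to finish, the thread can be left non-daemonic and joined. A minimal sketch of that variant (the helper name is an assumption, not part of the snippet):

from threading import Thread

def start_and_wait(run):
    # hypothetical helper: a non-daemon thread that the caller blocks on until it finishes
    thread = Thread(target=run)
    thread.start()
    thread.join()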
Example #20
def threaded_get(url=None,
                 urls=None,
                 url_iter=None,
                 num_threads=10,
                 dl=None,
                 cb=None,
                 depth=True,
                 **kwargs):
    """Download these urls in parallel

    url:
        the webpage to download
    urls:
        the webpages to download
    num_threads:
        the number of threads to download urls with
    cb:
        Called after each download with the HTML of the download. 
        The arguments are the url and downloaded html.
        Whatever URLs are returned are added to the crawl queue.
    dl:
        A callback for customizing the download.
        Takes the download object and url and should return the HTML.
    depth:
        True for depth first search
    """
    running = True
    lock = threading.Lock()

    def add_iter_urls():
        if lock.acquire(False):
            for url in url_iter or []:
                download_queue.append(url)
                break
            lock.release()

    def process_queue():
        """Thread for downloading webpages
        """
        D = Download(**kwargs)

        while True:
            try:
                url = download_queue.pop() if depth else download_queue.popleft()

            except IndexError:
                add_iter_urls()
                break

            else:
                # download this url
                html = dl(D, url, **kwargs) if dl else D.get(url, **kwargs)
                if cb:
                    try:
                        # use callback to process downloaded HTML
                        result = cb(D, url, html)

                    except StopCrawl:
                        common.logger.info('Stopping crawl signal')
                        running = False

                    except Exception:
                        # catch any callback error to avoid losing thread
                        common.logger.exception('\nIn callback for: ' +
                                                str(url))

                    else:
                        # add these URL's to crawl queue
                        for link in result or []:
                            download_queue.append(link)

                # update the crawler state
                # no download or error so must have read from cache
                num_caches = 0 if D.num_downloads or D.num_errors else 1
                state.update(num_downloads=D.num_downloads,
                             num_errors=D.num_errors,
                             num_caches=num_caches,
                             queue_size=len(download_queue))

    download_queue = collections.deque()
    if urls:
        download_queue.extend(urls)
    if url:
        download_queue.append(url)
    common.logger.debug('Start new crawl')

    # initiate the state file with the number of URL's already in the queue
    state = State()
    state.update(queue_size=len(download_queue))

    # wait for all download threads to finish
    threads = []
    while running and (threads or download_queue):
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < num_threads and download_queue:
            # can start more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # set daemon so the main thread can exit when it receives ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
    # save the final state after threads finish
    state.save()
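A short usage sketch for the StopCrawl mechanism above: a callback can raise StopCrawl to end the crawl early. The callback body and the page limit are illustrative assumptions:

seen = []

def limited_cb(D, url, html):
    # hypothetical callback: stop the crawl once 100 pages have been processed
    seen.append(url)
    if len(seen) >= 100:
        raise StopCrawl()
    return []  # no new links are queued in this sketch

threaded_get(url='http://example.com', cb=limited_cb)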
Example #21
  def run_async(self, sockopt=None, sslopt=None,
                ping_interval=0, ping_timeout=None,
                http_proxy_host=None, http_proxy_port=None,
                http_no_proxy=None, http_proxy_auth=None,
                skip_utf8_validation=False,
                host=None, origin=None):
      """
      run event loop for WebSocket framework.
      This loop is an infinite loop and stays alive while the websocket is available.
      sockopt: values for socket.setsockopt.
          sockopt must be a tuple
          and each element is an argument of sock.setsockopt.
      sslopt: ssl socket optional dict.
      ping_interval: automatically send "ping" command
          every specified period (seconds).
          If set to 0, no ping is sent automatically.
      ping_timeout: timeout (seconds) if the pong message is not received.
      http_proxy_host: http proxy host name.
      http_proxy_port: http proxy port. If not set, set to 80.
      http_no_proxy: host names which don't use the proxy.
      skip_utf8_validation: skip utf8 validation.
      host: update host header.
      origin: update origin header.
      """

      if not ping_timeout or ping_timeout <= 0:
          ping_timeout = None
      if ping_timeout and ping_interval and ping_interval <= ping_timeout:
          raise WebSocketException("Ensure ping_interval > ping_timeout")
      if sockopt is None:
          sockopt = []
      if sslopt is None:
          sslopt = {}
      if self.sock:
          raise WebSocketException("socket is already opened")
      thread = None
      close_frame = None

      try:
          logging.debug("Starting")
          self.sock = websocket.WebSocket(
              self.get_mask_key, sockopt=sockopt, sslopt=sslopt,
              fire_cont_frame=self.on_cont_message and True or False,
              skip_utf8_validation=skip_utf8_validation)
          logging.debug("Created socket")
          self.sock.settimeout(websocket.getdefaulttimeout())
          logging.debug("Connecting")
          self.sock.connect(
              self.url, header=self.header, cookie=self.cookie,
              http_proxy_host=http_proxy_host,
              http_proxy_port=http_proxy_port, http_no_proxy=http_no_proxy,
              http_proxy_auth=http_proxy_auth, subprotocols=self.subprotocols,
              host=host, origin=origin)
          logging.debug("Calling on open")
          self._callback(self.on_open)

          if ping_interval:
              event = threading.Event()
              thread = threading.Thread(
                  target=self._send_ping, args=(ping_interval, event))
              thread.setDaemon(True)
              thread.start()

          logging.debug("Entering mainloop")
          while self.sock.connected:
              logging.debug("Background websocket client calling select")
              r, w, e = select.select(
                  (self.sock.sock, self.dispatch_queue.read_pipe), (), (), ping_timeout)
              if not self.keep_running:
                  break

              if r and self.dispatch_queue.read_pipe in r:
                logging.debug("Background websocket client running queued jobs")
                n = self.dispatch_queue.run_jobs()

              if r and self.sock.sock in r:
                  op_code, frame = self.sock.recv_data_frame(True)
                  if op_code == websocket.ABNF.OPCODE_CLOSE:
                      close_frame = frame
                      break
                  elif op_code == websocket.ABNF.OPCODE_PING:
                      self._callback(self.on_ping, frame.data)
                  elif op_code == websocket.ABNF.OPCODE_PONG:
                      self.last_pong_tm = time.time()
                      self._callback(self.on_pong, frame.data)
                  elif op_code == websocket.ABNF.OPCODE_CONT and self.on_cont_message:
                      self._callback(self.on_data, frame.data,
                                     frame.opcode, frame.fin)
                      self._callback(self.on_cont_message,
                                     frame.data, frame.fin)
                  else:
                      data = frame.data
                      if six.PY3 and op_code == websocket.ABNF.OPCODE_TEXT:
                          data = data.decode("utf-8")
                      self._callback(self.on_data, data, frame.opcode, True)
                      self._callback(self.on_message, data)

              if ping_timeout and self.last_ping_tm \
                      and time.time() - self.last_ping_tm > ping_timeout \
                      and self.last_ping_tm - self.last_pong_tm > ping_timeout:
                  raise WebSocketTimeoutException("ping/pong timed out")
          logging.debug("While loop exited")
      except (Exception, KeyboardInterrupt, SystemExit) as e:
          traceback.print_exc()
          self._callback(self.on_error, e)
          if isinstance(e, SystemExit):
              # propagate SystemExit further
              raise
      finally:
          logging.debug("Everything has gone to shit")
          if thread and thread.isAlive():
              event.set()
              thread.join()
              self.keep_running = False
          if self.sock is not None:
              self.sock.close()
          close_args = self._get_close_args(
              close_frame.data if close_frame else None)
          self._callback(self.on_close, *close_args)
          self.sock = None
Example #22
def threaded_get(url=None, urls=None, url_iter=None, num_threads=10, dl=None, cb=None, depth=True, **kwargs):
    """Download these urls in parallel

    url:
        the webpage to download
    urls:
        the webpages to download
    num_threads:
        the number of threads to download urls with
    cb:
        Called after each download with the HTML of the download. 
        The arguments are the url and downloaded html.
        Whatever URLs are returned are added to the crawl queue.
    dl:
        A callback for customizing the download.
        Takes the download object and url and should return the HTML.
    depth:
        True for depth first search
    """
    running = True
    lock = threading.Lock()
    def add_iter_urls():
        if lock.acquire(False):
            for url in url_iter or []:
                download_queue.append(url)
                break
            lock.release()


    def process_queue():
        """Thread for downloading webpages
        """
        D = Download(**kwargs)

        while True:
            try:
                url = download_queue.pop() if depth else download_queue.popleft()

            except IndexError:
                add_iter_urls()
                break

            else:
                # download this url
                html = dl(D, url, **kwargs) if dl else D.get(url, **kwargs)
                if cb:
                    try:
                        # use callback to process downloaded HTML
                        result = cb(D, url, html)

                    except StopCrawl:
                        common.logger.info('Stopping crawl signal')
                        running = False

                    except Exception:
                        # catch any callback error to avoid losing thread
                        common.logger.exception('\nIn callback for: ' + str(url))

                    else:
                        # add these URL's to crawl queue
                        for link in result or []:
                            download_queue.append(urlparse.urljoin(url, link))
                                        
                # update the crawler state
                # no download or error so must have read from cache
                num_caches = 0 if D.num_downloads or D.num_errors else 1
                state.update(num_downloads=D.num_downloads, num_errors=D.num_errors, num_caches=num_caches, queue_size=len(download_queue))

    download_queue = collections.deque()
    if urls:
        download_queue.extend(urls)
    if url:
        download_queue.append(url)
    common.logger.debug('Start new crawl')

    # initiate the state file with the number of URL's already in the queue
    state = State()
    state.update(queue_size=len(download_queue))

    # wait for all download threads to finish
    threads = []
    while running and (threads or download_queue):
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < num_threads and download_queue:
            # can start more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True) # set daemon so the main thread can exit when it receives ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
    # save the final state after threads finish
    state.save()
Example #23
def setDaemon(thread):
    # Reference: http://stackoverflow.com/questions/190010/daemon-threads-explanation
    if PYVERSION >= "2.6":
        thread.daemon = True
    else:
        thread.setDaemon(True)
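For reference, on current Python versions the version check is unnecessary: the daemon attribute has existed since Python 2.6 and Thread.setDaemon() is deprecated as of Python 3.10, where the flag is usually set via the attribute or the constructor. A short sketch (the worker name is a placeholder):

import threading

def some_work():
    pass  # placeholder for the real worker function

# Python 3: set the daemon flag directly in the constructor (or via thread.daemon = True)
thread = threading.Thread(target=some_work, daemon=True)
thread.start()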
Example #24
def adsbConnect():
    thread = Thread(target = adsbThread)
    thread.setDaemon(True)
    thread.start()
Example #25
def threaded_get(url=None, urls=None, num_threads=10, dl=None, cb=None, depth=None, wait_finish=True, reuse_queue=False, max_queue=1000, **kwargs):
    """Download these urls in parallel

    url:
        the webpage to download
    urls:
        the webpages to download
    num_threads:
        the number of threads to download urls with
    cb:
        Called after each download with the HTML of the download. 
        The arguments are the url and downloaded html.
        Whatever URLs are returned are added to the crawl queue.
    dl:
        A callback for customizing the download.
        Takes the download object and url and should return the HTML.
    depth:
        Deprecated - will be removed in later version
    wait_finish:
        whether to wait until all download threads have finished before returning
    reuse_queue:
        Whether to continue the queue from the previous run.
    max_queue:
        The maximum number of queued URLs to keep in memory.
        The rest will be in the cache.
    """
    if kwargs.pop('cache', None):
        common.logger.debug('threaded_get does not support cache flag')
    lock = threading.Lock()


    class DownloadThread(threading.Thread):
        """Thread for downloading webpages
        """
        processing = collections.deque() # to track whether threads are still downloading
        discovered = {} # the URL's that have been discovered

        def __init__(self):
            threading.Thread.__init__(self)

        def run(self):
            D = Download(**kwargs)
            queue = pdict.Queue(settings.queue_file)

            while seed_urls or DownloadThread.processing:
                # keep track that we are processing a url
                DownloadThread.processing.append(1) 
                try:
                    url = seed_urls.pop()

                except IndexError:
                    # currently no urls to process
                    DownloadThread.processing.popleft()
                    # so check again later
                    time.sleep(SLEEP_TIME)

                else:
                    try:
                        # download this url
                        html = dl(D, url, **kwargs) if dl else D.get(url, **kwargs)
                        if cb:
                            try:
                                # use callback to process downloaded HTML
                                result = cb(D, url, html)

                            except Exception, e:
                                # catch any callback error to avoid losing thread
                                common.logger.exception('\nIn callback for: ' + str(url))

                            else:
                                # add these URL's to crawl queue
                                for link in result or []:
                                    cb_url = urlparse.urljoin(url, link)
                                    if isinstance(result, dict):
                                        DownloadThread.discovered[cb_url] = result[link]
                                    else:
                                        DownloadThread.discovered[cb_url] = DEFAULT_PRIORITY
                                            
                                if len(seed_urls) < max_queue:
                                    # need to request more queue
                                    if DownloadThread.discovered or len(queue) > 0:
                                        # there are outstanding in the queue
                                        if lock.acquire(False):
                                            # no other thread is downloading
                                            common.logger.debug('Loading from queue: %d' % len(seed_urls))
                                            discovered = []
                                            while DownloadThread.discovered:
                                                discovered.append(DownloadThread.discovered.popitem())
                                            queue.push(discovered)
                                            # get next batch of URLs from cache
                                            seed_urls.extend(queue.pull(limit=max_queue))
                                            lock.release()
                    finally:
                        # have finished processing
                        # make sure this is called even on exception to avoid eternal loop
                        DownloadThread.processing.pop()
                    # update the crawler state
                    # no download or error so must have read from cache
                    num_caches = 0 if D.num_downloads or D.num_errors else 1
                    state.update(num_downloads=D.num_downloads, num_errors=D.num_errors, num_caches=num_caches, queue_size=len(queue))


    queue = pdict.Queue(settings.queue_file)
    if reuse_queue:
        # command line flag to enable queue
        queued_urls = queue.pull(limit=max_queue)
    else:
        queued_urls = []
    if queued_urls:
        # continue the previous crawl
        seed_urls = collections.deque(queued_urls)
        common.logger.debug('Loading crawl queue')
    else:
        # remove any queued URL's so can crawl again
        queue.clear()
        urls = urls or []
        if url:
            urls.append(url)
        queue.push([(url, DEFAULT_PRIORITY) for url in urls])
        # put urls into thread safe queue
        seed_urls = collections.deque(queue.pull(limit=max_queue))
        common.logger.debug('Start new crawl')

    # initiate the state file with the number of URL's already in the queue
    state = State()
    state.update(queue_size=len(queue))

    # start the download threads
    threads = [DownloadThread() for i in range(num_threads)]
    for thread in threads:
        thread.setDaemon(True) # set daemon so the main thread can exit when it receives ctrl-c
        thread.start()

    # Wait for all download threads to finish
    while threads and wait_finish:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        time.sleep(SLEEP_TIME)
    # save the final state after threads finish
    state.save()
Example #26
        total.append(line)
        threadLimiter.release()


# Copy inaccessible hosts into this
down_hosts = []
try:
    with open(argv[2]) as down:
        down_hosts_raw = down.readlines()
        down_hosts = [line[2:] for line in down_hosts_raw]
        for l in down_hosts:
            line = l[:-1]
            threadLimiter.acquire()
            signal.signal(signal.SIGINT, signal_handler)
            thread = threading.Thread(target=run, args=(line, ))
            thread.setDaemon(True)
            thread.start()
except ExitCommand:
    pass
finally:
    with open('failed.txt', 'w') as failed_hosts_file:
        for host in failed:
            failed_hosts_file.write("- {}\n".format(host))
    with open('success.txt', 'w') as successful_hosts_file:
        for host in success:
            successful_hosts_file.write("- {}\n".format(host))
    with open('processed_hosts.txt', 'w') as processed_hosts_file:
        for host in processed:
            processed_hosts_file.write("- {}\n".format(host))
    logging.warning("The following hosts failed to reauthenticate! {}".format(
        failed, ))
Example #27
def threaded_get(url=None,
                 urls=None,
                 num_threads=10,
                 dl=None,
                 cb=None,
                 depth=None,
                 wait_finish=True,
                 reuse_queue=False,
                 max_queue=1000,
                 **kwargs):
    """Download these urls in parallel

    url:
        the webpage to download
    urls:
        the webpages to download
    num_threads:
        the number of threads to download urls with
    cb:
        Called after each download with the HTML of the download. 
        The arguments are the url and downloaded html.
        Whatever URLs are returned are added to the crawl queue.
    dl:
        A callback for customizing the download.
        Takes the download object and url and should return the HTML.
    depth:
        Deprecated - will be removed in later version
    wait_finish:
        whether to wait until all download threads have finished before returning
    reuse_queue:
        Whether to continue the queue from the previous run.
    max_queue:
        The maximum number of queued URLs to keep in memory.
        The rest will be in the cache.
    """
    if kwargs.pop('cache', None):
        common.logger.debug('threaded_get does not support cache flag')
    lock = threading.Lock()

    class DownloadThread(threading.Thread):
        """Thread for downloading webpages
        """
        processing = collections.deque()  # to track whether threads are still downloading
        discovered = {}  # the URL's that have been discovered

        def __init__(self):
            threading.Thread.__init__(self)

        def run(self):
            D = Download(**kwargs)
            queue = pdict.Queue(settings.queue_file)

            while seed_urls or DownloadThread.processing:
                # keep track that we are processing a url
                DownloadThread.processing.append(1)
                try:
                    url = seed_urls.pop()

                except IndexError:
                    # currently no urls to process
                    DownloadThread.processing.popleft()
                    # so check again later
                    time.sleep(SLEEP_TIME)

                else:
                    try:
                        # download this url
                        html = dl(D, url, **kwargs) if dl else D.get(
                            url, **kwargs)
                        if cb:
                            try:
                                # use callback to process downloaded HTML
                                cb_urls = cb(D, url, html)

                            except Exception, e:
                                # catch any callback error to avoid losing thread
                                common.logger.error('in callback for: ' +
                                                    str(url) + '\n' +
                                                    traceback.format_exc())

                            else:
                                # add these URL's to crawl queue
                                for cb_url in cb_urls or []:
                                    if isinstance(cb_urls, dict):
                                        DownloadThread.discovered[
                                            cb_url] = cb_urls[cb_url]
                                    else:
                                        DownloadThread.discovered[
                                            cb_url] = DEFAULT_PRIORITY

                                if len(seed_urls) < max_queue:
                                    # need to request more queue
                                    if DownloadThread.discovered or len(
                                            queue) > 0:
                                        # there are outstanding in the queue
                                        if lock.acquire(False):
                                            # no other thread is downloading
                                            common.logger.debug(
                                                'Loading from queue: %d' %
                                                len(seed_urls))
                                            discovered = []
                                            while DownloadThread.discovered:
                                                discovered.append(
                                                    DownloadThread.discovered.
                                                    popitem())
                                            queue.push(discovered)
                                            # get next batch of URLs from cache
                                            seed_urls.extend(
                                                queue.pull(limit=max_queue))
                                            lock.release()
                                """
                                for cb_url in cb_urls or []:
                                    if cb_url not in DownloadThread.discovered:
                                        DownloadThread.discovered[cb_url] = 1
                                        seed_urls.append(cb_url)
                                """
                    finally:
                        # have finished processing
                        # make sure this is called even on exception to avoid eternal loop
                        DownloadThread.processing.pop()
                    # update the crawler state
                    # no download or error so must have read from cache
                    num_caches = 0 if D.num_downloads or D.num_errors else 1
                    state.update(num_downloads=D.num_downloads,
                                 num_errors=D.num_errors,
                                 num_caches=num_caches,
                                 queue_size=len(queue))

    queue = pdict.Queue(settings.queue_file)
    if reuse_queue:
        # command line flag to enable queue
        queued_urls = queue.pull(limit=max_queue)
    else:
        queued_urls = []
    if queued_urls:
        # continue the previous crawl
        seed_urls = collections.deque(queued_urls)
        common.logger.debug('Loading crawl queue')
    else:
        # remove any queued URL's so can crawl again
        queue.clear()
        urls = urls or []
        if url:
            urls.append(url)
        queue.push([(url, DEFAULT_PRIORITY) for url in urls])
        # put urls into thread safe queue
        seed_urls = collections.deque(queue.pull(limit=max_queue))
        common.logger.debug('Start new crawl')

    # initiate the state file with the number of URL's already in the queue
    state = State()
    state.update(queue_size=len(queue))

    # start the download threads
    threads = [DownloadThread() for i in range(num_threads)]
    for thread in threads:
        thread.setDaemon(True)  # set daemon so the main thread can exit when it receives ctrl-c
        thread.start()

    # Wait for all download threads to finish
    while threads and wait_finish:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        time.sleep(SLEEP_TIME)
    # save the final state after threads finish
    state.save()
Example #28
def setDaemon(thread):
    # Reference: http://stackoverflow.com/questions/190010/daemon-threads-explanation
    if PYVERSION >= "2.6":
        thread.daemon = True
    else:
        thread.setDaemon(True)