def spider(): """ Create the httpclient pool which default size is 1200 """ httpclient_pool = HttpClientPool() """ Allocate a task to this spider node, skip is the task's start point in queue, and limit is the number of data which will be snatched in this spider node. """ queue_size = queue.count() limit = queue_size / spider_total_number skip = limit * spider_serial_number if spider_total_number - 1 == spider_serial_number: limit += queue_size % spider_total_number print "skip = ", skip, ", limit = ", limit with gevent.Timeout(None, False): print "This spider is start." orders = queue.find(skip, limit) while orders.count() > 0: for order in orders: thead_pool.spawn(httpclient_pool.request, order) thead_pool.join() print "Start =", httpclient.start,", End =", httpclient.end,", Error =", httpclient.error httpclient.start = 0 httpclient.end = 0 httpclient.error = 0 orders = queue.find(skip, limit) print "This spider is finished."
def snatch(): global running """ If this node is running, pass """ if running == True: return """ Create the httpclient pool which default size is 10 """ httpclient_pool = HttpClientPool() """ Allocate a task to this spider node, skip is the task's start point in queue, and limit is the number of data which will be snatched in this spider node. """ queue_size = queue.count() limit = queue_size / node_total_number skip = limit * node_serial_number if node_total_number - 1 == node_serial_number: limit += queue_size % node_total_number print "skip = ", skip, ", limit = ", limit with gevent.Timeout(None, False): print "This spider is start." running = True orders = queue.find(skip, limit) #while orders.count() > 0: for order in orders: thread_pool.spawn(httpclient_pool.request, order) thread_pool.join() print "Start =", httpclientpool.start,", End =", httpclientpool.end,", Error =", httpclientpool.error httpclientpool.start = 0 httpclientpool.end = 0 httpclientpool.error = 0 orders = queue.find(skip, limit) running = False print "This spider is finished."