def checkTimeOutPut(args):
    t = None
    global currCommandProcess
    global stde
    global stdo
    stde = None
    stdo = None

    def executeCommand():
        global currCommandProcess
        global stdo
        global stde
        try:
            stdo, stde = currCommandProcess.communicate()
            printLog('stdout:\n' + str(stdo))
            printLog('stderr:\n' + str(stde))
        except:
            printLog("ERROR: UNKNOWN Exception - +checkWinTimeOutPut()::executeCommand()")

    currCommandProcess = subprocess.Popen(args, stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE, shell=True)
    thread = Thread(target=executeCommand)
    thread.start()
    thread.join(TIMOUT_VAL)  # wait for the thread to complete
    if thread.is_alive():
        printLog('ERROR: Killing the process - terminating thread because it is taking too much of time to execute')
        currCommandProcess.kill()
        printLog('ERROR: Timed out exception')
        raise errorHandler.ApplicationException(__file__, errorHandler.TIME_OUT)
    if stdo == "" or stdo is None:
        errCode = currCommandProcess.poll()
        printLog('ERROR: @@@@@Raising Called processor exception')
        raise subprocess.CalledProcessError(errCode, args, output=stde)
    return stdo

def checkTimeOutPut(args):
    t = None
    global currCommandProcess
    global stde
    global stdo
    stde = None
    stdo = None

    def executeCommand():
        global currCommandProcess
        global stdo
        global stde
        try:
            stdo, stde = currCommandProcess.communicate()
            printLog('stdout:\n' + str(stdo))
            printLog('stderr:\n' + str(stde))
        except:
            printLog("ERROR: UNKNOWN Exception - +checkWinTimeOutPut()::executeCommand()")

    currCommandProcess = subprocess.Popen(args, stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE)
    thread = Thread(target=executeCommand)
    thread.start()
    thread.join(TIMOUT_VAL)  # wait for the thread to complete
    if thread.is_alive():
        printLog('ERROR: Killing the process - terminating thread because it is taking too much of time to execute')
        currCommandProcess.kill()
        printLog('ERROR: Timed out exception')
        raise errorHandler.ApplicationException(__file__, errorHandler.TIME_OUT)
    if stdo == "" or stdo is None:
        errCode = currCommandProcess.poll()
        printLog('ERROR: @@@@@Raising Called processor exception')
        raise subprocess.CalledProcessError(errCode, args, output=stde)
    return stdo

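# A minimal, self-contained sketch of the pattern used by the two checkTimeOutPut()
# variants above: run the child process in a worker thread, join with a timeout,
# and kill the child if the thread is still alive. run_with_timeout, cmd and
# timeout_sec are illustrative names, not part of the original code.
import subprocess
import threading

def run_with_timeout(cmd, timeout_sec):
    result = {}

    def worker():
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        result['proc'] = proc
        result['out'], result['err'] = proc.communicate()

    t = threading.Thread(target=worker)
    t.start()
    t.join(timeout_sec)              # returns after timeout_sec even if the worker is still busy
    if t.is_alive():                 # worker did not finish in time
        if result.get('proc'):
            result['proc'].kill()    # kill the child so communicate() can return
        t.join()                     # the worker thread can now exit
        raise RuntimeError('command timed out: {!r}'.format(cmd))
    return result.get('out')
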
def MoveToJointPositions(limb, moves, queue, write=True):
    try:
        for move in moves:
            thread = threading.Thread(
                target=move_thread,
                args=(limb, move, queue, write)
            )
            if (move.values()):
                thread.daemon = True
                thread.start()
                baxter_dataflow.wait_for(
                    lambda: not (thread.is_alive()),
                    timeout=20.0,
                    timeout_msg=("Timeout while waiting for %s move thread"
                                 " to finish" % limb.name),
                    rate=10,
                )
                thread.join()
                result = queue.get()
                if not result is None:
                    raise queue.get()
                rospy.sleep(1.0)
    except Exception, exception:
        queue.put(traceback.format_exc())
        queue.put(exception)

def do_GET(self):
    print 'do_GET: ', self.path
    self.path = self.path.split("?")[0]
    if self.path == "/":
        self.path = "/index.html"
    try:
        #Check the file extension required and
        #set the right mime type
        sendReply = False
        if self.path.endswith(".html"):
            mimetype = 'text/html'
            sendReply = True
        if self.path.endswith(".jpg"):
            mimetype = 'image/jpg'
            sendReply = True
        if self.path.endswith(".gif"):
            mimetype = 'image/gif'
            sendReply = True
        if self.path.endswith(".js"):
            mimetype = 'application/javascript'
            sendReply = True
        if self.path.endswith(".css"):
            mimetype = 'text/css'
            sendReply = True

        if sendReply == True:
            #Open the static file requested and send it
            print 'serve file:', curdir + sep + self.path
            f = open(curdir + sep + self.path)
            self.send_response(200)
            self.send_header('Content-type', mimetype)
            self.end_headers()
            self.wfile.write(f.read())
            f.close()

            #print 'Create new thread..'
            print threading.active_count()
            thread = None
            #if not thread or not thread.is_alive():
            if threading.active_count() <= 1:
                print 'start new...'
                thread = Thread(target=self.capture_img)
                thread.start()
            else:
                print 'running...'
        return
    except IOError:
        self.send_error(404, 'File Not Found: %s' % self.path)

    thread = None
    if not thread or not thread.is_alive():
        print 'start new...'
        thread = Thread(target=self.capture_img)
        thread.start()
    else:
        print 'running...'

def __is_task_done(self, threads):
    finishThreadNum = 0
    for thread in threads:
        if not thread.is_alive():
            finishThreadNum += 1
            print "finish " + str(finishThreadNum)
    if finishThreadNum == len(threads):
        return True
    else:
        return False

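# The same completion test can be written more compactly; a sketch assuming
# `threads` is a list of threading.Thread objects, as in the method above.
def all_tasks_done(threads):
    return all(not t.is_alive() for t in threads)
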
def run(self, timeout):
    print "running " + self.cmd

    def target():
        self.process = subprocess.Popen(self.cmd, shell=True)
        self.process.communicate()

    thread = threading.Thread(target=target)
    thread.start()

    thread.join(timeout)
    if thread.is_alive():
        print 'Terminating process'
        self.process.terminate()
        thread.join()

def run(self, timeout=0):
    def target():
        print 'Thread started'
        self.process = subprocess.Popen(self.cmd, shell=True)
        self.process.communicate()
        print 'Thread finished'

    thread = threading.Thread(target=target)
    thread.start()
    if timeout == 0:
        return

    thread.join(timeout)
    if thread.is_alive():
        print 'Terminating process'
        self.process.terminate()
        thread.join()
    print self.process.returncode

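# For reference: on Python 3.3+ the watchdog thread in the two run() methods above
# can be avoided, because Popen.communicate() accepts a timeout and raises
# subprocess.TimeoutExpired. A sketch of the same behaviour; the function and
# parameter names are illustrative, and timeout=None means wait indefinitely.
import subprocess

def run(cmd, timeout=None):
    process = subprocess.Popen(cmd, shell=True)
    try:
        process.communicate(timeout=timeout)
    except subprocess.TimeoutExpired:
        print('Terminating process')
        process.terminate()
        process.communicate()   # reap the terminated child
    print(process.returncode)
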
def start_scraping(threads_number):
    global config
    website_type = config.CLASS_TYPE_TROPICAIR
    global_sc_obj = Scraper(
        use_cache=False,  #enable cache globally
        retries=3,
        use_default_logging=False
    )

    tropicair_depart_arrival_list = []
    try:
        with open(config.AIRPORT_RELATIONSHIP_FILE) as csvfile:
            reader = csv.reader(csvfile)
            for i, item in enumerate(reader):
                if i > 0 and item[0] != "" and item[1] != "":
                    obj = {}
                    obj["Departure"] = item[0]
                    obj["Arrival"] = item[1]
                    obj["Type"] = item[2]
                    if obj["Type"] == config.CLASS_TYPE_TROPICAIR_STR:
                        tropicair_depart_arrival_list.append(obj)
    except Exception as e:
        print(e)
        return

    sc_obj_list = []
    for i in range(0, threads_number):
        sc_obj = Scraper(
            use_cache=False,  #enable cache globally
            retries=3,
            timeout=60,
            use_default_logging=False
        )
        sc_obj_list.append(sc_obj)

    tz = pytz.timezone('America/Los_Angeles')
    depart_arrival_list = tropicair_depart_arrival_list
    if len(depart_arrival_list) == 0:
        print('None depart arrival info')
        return

    filename = "{}.csv".format(common_lib.get_webiste_str(website_type))
    for i, depart_arrival_info in enumerate(depart_arrival_list):
        threads = []
        currentdate = datetime.now(tz)
        print("Current Date & Time: {} , {}".format(currentdate.strftime('%Y-%m-%d'),
                                                    currentdate.strftime('%H:%M')))
        departure = depart_arrival_info["Departure"]
        arrival = depart_arrival_info["Arrival"]
        departure_abbr = ""
        arrival_abbr = ""
        start_step = 0
        departure_abbr = departure.split("-")[1].strip()
        arrival_abbr = arrival.split("-")[1].strip()

        # for step in range(start_step, start_step + config.DAYS_TO_BE_SCRAPED):
        #     date_list.append({"date":datetime.now(tz) + timedelta(days=step), "status":"none", "error_count":0})
        date_list = date_thread_list(threads_number)

        while len(date_list) > 0:
            if len(threads) < threads_number:
                start_date = None
                bStop = True
                for date in date_list:
                    if date["status"] != "complete":
                        bStop = False
                    if date["status"] == "none":
                        start_date = date
                        start_date["status"] = "pending"
                        break
                if bStop == True:
                    break
                if start_date == None:
                    continue

                print("++++++++++++++++++++++++++++++")
                print("Depart List = " + str(len(depart_arrival_list)) + " Index =" + str(i))
                # print (depart_arrival_info)
                print(departure_abbr + "," + arrival_abbr)
                print("++++++++++++++++++++++++++++++")

                sleep(config.DRIVER_SHORT_WAITING_SECONDS)
                proxy_ip, proxy_port, proxy_user, proxy_pass = random_proxy()
                if proxy_user != None:
                    auth_str = "{}:{}".format(proxy_user, proxy_pass)
                    proxy = Proxy(proxy_ip, proxy_port, auth_str)
                else:
                    proxy = Proxy(proxy_ip, proxy_port)

                s = sc_obj_list[len(date_list) % threads_number]
                s.proxy_manager.session_proxy = proxy
                class_obj = TropicAir(s, start_date, departure, arrival, currentdate,
                                      tz, departure_abbr, arrival_abbr)
                thread_obj = threading.Thread(target=class_obj.parse_website,
                                              args=(config.DRIVER_VALUE_PHANTOMJS,))
                                              # args=(config.DRIVER_VALUE_CHROME,))
                threads.append(thread_obj)
                thread_obj.start()

            for thread in threads:
                if not thread.is_alive():
                    thread.join()
                    threads.remove(thread)

        # filename = "{}_{}_{}_{}.csv".format(common_lib.get_webiste_str(website_type), departure_abbr, arrival_abbr, currentdate.strftime('%Y-%b-%d %H'))
        no_result = 0
        for item in date_list:
            no_result += item["no_result"]

        stopdate = datetime.now(tz)
        print("Finish Date & Time: {} , {}".format(stopdate.strftime('%Y-%m-%d'),
                                                   stopdate.strftime('%H:%M')))
        global_sc_obj.save([
            "Departure", departure,
            "Arrival", arrival,
            "No Result", no_result,
            "File Name", filename,
            "Start", currentdate.strftime('%Y-%m-%d %H:%M'),
            "Finish", stopdate.strftime('%Y-%m-%d %H:%M')
        ], "output/output_{}.csv".format(website_type))
        print("*************************")
        # break

        try:
            common_lib.upload_file(filename, "output/")
            print "Upload"
        except:
            print("Error while upload :" + filename)

def start_scraping(threads_number, website_type):
    global config
    global_sc_obj = Scraper(
        use_cache=False,  #enable cache globally
        retries=3,
    )
    logger = global_sc_obj.logger

    tropicair_depart_arrival_list = []
    mayaislandair_depart_arrival_list = []
    try:
        with open(config.AIRPORT_RELATIONSHIP_FILE) as csvfile:
            reader = csv.reader(csvfile)
            for i, item in enumerate(reader):
                if i > 0 and item[0] != "" and item[1] != "":
                    obj = {}
                    obj["Departure"] = item[0]
                    obj["Arrival"] = item[1]
                    obj["Type"] = item[2]
                    if obj["Type"] == config.CLASS_TYPE_MAYAISLANDAIR_STR:
                        mayaislandair_depart_arrival_list.append(obj)
                    elif obj["Type"] == config.CLASS_TYPE_TROPICAIR_STR:
                        tropicair_depart_arrival_list.append(obj)
                    else:
                        raise Exception("Invalid content in relation csv file")
    except Exception as e:
        print(e)
        return

    sc_obj_list = []
    for i in range(0, threads_number):
        sc_obj = Scraper(
            use_cache=False,  #enable cache globally
            retries=3,
            timeout=300,
            #log_file='logs/{}_log_{}.txt'.format(website_type, i)
        )
        sc_obj_list.append(sc_obj)

    tz = pytz.timezone('America/Los_Angeles')
    depart_arrival_list = []
    if website_type == config.CLASS_TYPE_MAYAISLANDAIR:
        depart_arrival_list = mayaislandair_depart_arrival_list
    elif website_type == config.CLASS_TYPE_TROPICAIR:
        depart_arrival_list = tropicair_depart_arrival_list

    if len(depart_arrival_list) == 0:
        print('None depart arrival info')
        return

    #depart_arrival_list = [depart_arrival_list[0]]
    threads = []
    for i, depart_arrival_info in enumerate(depart_arrival_list):
        currentdate = datetime.now(tz)
        print("Current Date & Time: {} , {}".format(
            currentdate.strftime('%Y-%m-%d'), currentdate.strftime('%H:%M')))
        departure = depart_arrival_info["Departure"]
        arrival = depart_arrival_info["Arrival"]
        departure_abbr = ""
        arrival_abbr = ""
        start_step = 0
        if website_type == config.CLASS_TYPE_MAYAISLANDAIR:
            departure_abbr = re.search("\((.*?)\)", departure, re.I | re.S | re.M).group(1).strip()
            arrival_abbr = re.search("\((.*?)\)", arrival, re.I | re.S | re.M).group(1).strip()
            start_step = 1  ## this website does not scrape today's data, so start with +1
        elif website_type == config.CLASS_TYPE_TROPICAIR:
            departure_abbr = departure.split("-")[1].strip()
            arrival_abbr = arrival.split("-")[1].strip()

        date_list = []
        no_result_info = {"Count": 0}
        for step in range(start_step, start_step + config.DAYS_TO_BE_SCRAPED):
            date_list.append({
                "date": datetime.now(tz) + timedelta(days=step),
                "status": "none",
                "error_count": 0
            })

        start_date_str = ""
        while len(date_list) > 0:
            if len(threads) < threads_number:
                start_date = None
                if no_result_info["Count"] > config.MAX_NO_RESULT_COUNT:
                    print("--------------------------")
                    print("No result any more")
                    print("--------------------------")
                    break
                for date in date_list:
                    if date["status"] == "complete":
                        # print ("Remove Date")
                        # print (date)
                        date_list.remove(date)
                    elif date["status"] == "none":
                        start_date = date
                        start_date["status"] = "pending"
                        break
                if len(date_list) == 0:
                    break
                if start_date == None:
                    continue

                print("++++++++++++++++++++++++++++++")
                print("Depart List = " + str(len(depart_arrival_list)) + " Index =" + str(i))
                # print (depart_arrival_info)
                print(departure_abbr + "," + arrival_abbr)
                print(start_date)
                print("++++++++++++++++++++++++++++++")

                start_date_str = start_date["date"].strftime('%Y-%m-%d')
                sleep(config.DRIVER_SHORT_WAITING_SECONDS)
                proxy_ip, proxy_port, proxy_user, proxy_pass = random_proxy()
                if proxy_user != None:
                    auth_str = "{}:{}".format(proxy_user, proxy_pass)
                    proxy = Proxy(proxy_ip, proxy_port, auth_str)
                else:
                    proxy = Proxy(proxy_ip, proxy_port)

                s = sc_obj_list[len(date_list) % threads_number]
                s.proxy_manager.session_proxy = proxy
                class_obj = None
                if website_type == config.WEBSITE_TYPE_MAYAISLANDAIR:
                    class_obj = MayaislandAir(s, start_date, departure, arrival, currentdate,
                                              tz, departure_abbr, arrival_abbr, no_result_info)
                else:
                    class_obj = TropicAir(s, start_date, departure, arrival, currentdate,
                                          tz, departure_abbr, arrival_abbr, no_result_info)
                thread_obj = threading.Thread(
                    target=class_obj.parse_website,
                    args=(config.DRIVER_VALUE_PHANTOMJS, ))
                    # args=(config.DRIVER_VALUE_CHROME,))
                threads.append(thread_obj)
                thread_obj.start()

            for thread in threads:
                if not thread.is_alive():
                    thread.join()
                    threads.remove(thread)

        print("*************************")
        print(len(date_list))
        print(no_result_info)
        filename = "{}_{}_{}_{}.csv".format(
            common_lib.get_webiste_str(website_type), departure_abbr, arrival_abbr,
            currentdate.strftime('%Y-%b-%d %H'))
        try:
            #common_lib.upload_file(filename, "output/")
            print "Upload"
        except:
            print("Error while upload :" + filename)

        global_sc_obj.save([
            "Departure", departure,
            "Arrival", arrival,
            "Date Len", len(date_list),
            "No Result", no_result_info["Count"],
            "File Name", filename,
            "Start Date", start_date_str
        ], "export_{}.csv".format(website_type))
        print("*************************")

def poll(self):
    if not self.lock_poll.acquire( 0 ):
        print "Loop::poll() could not acquire lock"
        return
    try:
        # check for completed threads
        for thread in self.threads:
            if not thread:
                self.threads.remove( thread )
            elif not thread.is_alive():
                station = thread.get_station()
                if station:
                    station._log( "Wrapping up thread (" + str(len(self.threads) - 1) + " threads remaining)" )
                    if action == 'check':
                        self.record( station )
                self.threads.remove( thread )

        # check for failed threads
        while len(self.threads) < self.max_threads:
            # check for stations in original list
            if len(self.stations):
                station_info = self.stations.pop(0)
            # once the original list is exhausted, check for retry stations
            elif len(self.stations_retry):
                station_info = self.stations_retry.pop(0)
            # if there are no threads remaining all stations have been checked
            elif not len(self.threads):
                self.summarize()
                self.poller.stop()
                self.done = True
                #self.lock_poll.release()
                raise ExLoopDone, "No stations remaining."
            else:
                # This occurs when we have no stations
                # in either the default or retry queues, and
                # we have at least one but less than
                # self.max_threads still running.
                break

            station = None
            try:
                if ( station_info.has_key('disabled') and (station_info['disabled'] == 'true') ):
                    raise ExStationDisabled, "Station is disabled"
                if ( station_info['type'] == 'Q680' ):
                    if self.continuity_only:
                        raise Exception("Continuity checks, Q680s not supported")
                    if self.versions_only:
                        raise Exception("Software version checks, Q680s not supported")
                    station = Station680()
                elif ( station_info['type'] == 'Q330' ):
                    station = Station330(legacy=False,
                                         continuity_only=self.continuity_only,
                                         versions_only=self.versions_only)
                    station.set_version_queue(self.version_queue)
                    station.set_version_files(self.version_files)
                elif ( station_info['type'] == 'Q330C' ):
                    station = Station330(legacy=True,
                                         continuity_only=self.continuity_only,
                                         versions_only=self.versions_only)
                    station.set_version_queue(self.version_queue)
                    station.set_version_files(self.version_files)
                else:
                    raise ExStationTypeNotRecognized, "Station type not recognized"

                self.prep_station(station, station_info)
                self.find_proxies(station, station_info)

                permissions = stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH
                date = time.gmtime()
                dir = self.output_directory + '/' + station.name
                try:
                    if not os.path.exists(dir):
                        os.makedirs(dir)
                    if os.stat(dir).st_mode != permissions:
                        os.chmod(dir, permissions)
                except:
                    raise Exception, "CheckLoop::init_dir() could not create directory: %s" % dir
                dir += time.strftime("/%Y", date)
                try:
                    if not os.path.exists(dir):
                        os.makedirs(dir)
                    if os.stat(dir).st_mode != permissions:
                        os.chmod(dir, permissions)
                except:
                    raise Exception, "CheckLoop::init_dir() could not create directory: %s" % dir
                dir += time.strftime("/%j", date)
                try:
                    if not os.path.exists(dir):
                        os.makedirs(dir)
                    if os.stat(dir).st_mode != permissions:
                        os.chmod(dir, permissions)
                except:
                    raise Exception, "CheckLoop::init_dir() could not create directory: %s" % dir

                file = "%s/%s.log" % (dir, action)
                station.log_file_name(file)
                station.log_to_file()
                station.log_to_screen()
                station.set_output_directory(self.output_directory + '/' + station.name)
                if not station.min_info():
                    self.stations_partial.append(station_info)

                thread = ThreadStation()
                thread.set_station( station )
                thread.set_info( station_info )
                station._log( "Starting thread" )
                thread.start()
                self.threads.append(thread)
            except Exception, e:
                if station:
                    print "Loop::poll() failed to create thread. Exception: %s" % str(e)
                else:
                    print "Loop::poll() failed to create station object. Exception: %s" % str(e)
    except ExLoopDone, e:
        print "All stations have been processed"

def start_scraping(threads_number):
    global config
    website_type = config.CLASS_TYPE_TROPICAIR
    global_sc_obj = Scraper(
        use_cache=False,  #enable cache globally
        retries=3,
        use_default_logging=False)

    tropicair_depart_arrival_list = []
    try:
        with open(config.AIRPORT_RELATIONSHIP_FILE) as csvfile:
            reader = csv.reader(csvfile)
            for i, item in enumerate(reader):
                if i > 0 and item[0] != "" and item[1] != "":
                    obj = {}
                    obj["Departure"] = item[0]
                    obj["Arrival"] = item[1]
                    obj["Type"] = item[2]
                    if obj["Type"] == config.CLASS_TYPE_TROPICAIR_STR:
                        tropicair_depart_arrival_list.append(obj)
    except Exception as e:
        print(e)
        return

    sc_obj_list = []
    driver_list = []
    for i in range(0, threads_number):
        driver, user_agent, proxy, screen_resolution = common_lib.create_phantomjs_driver()  # PHANTOMJS PART
        driver_list.append({"driver": driver, "status": "none"})

    tz = pytz.timezone('America/Los_Angeles')
    depart_arrival_list = tropicair_depart_arrival_list
    if len(depart_arrival_list) == 0:
        print('None depart arrival info')
        return

    threads = []
    file_currentdate = datetime.now(tz)
    filename = "{}_{}.csv".format(common_lib.get_webiste_str(website_type),
                                  file_currentdate.strftime('%Y-%m-%d %H:%M'))
    for i, depart_arrival_info in enumerate(depart_arrival_list):
        currentdate = datetime.now(tz)
        print("Current Date & Time: {} , {}".format(
            currentdate.strftime('%Y-%m-%d'), currentdate.strftime('%H:%M')))
        departure = depart_arrival_info["Departure"]
        arrival = depart_arrival_info["Arrival"]
        departure_abbr = ""
        arrival_abbr = ""
        start_step = 0
        if website_type == config.CLASS_TYPE_TROPICAIR:
            departure_abbr = departure.split("-")[1].strip()
            arrival_abbr = arrival.split("-")[1].strip()

        date_list = []
        no_result_info = {"Count": 0}
        for step in range(start_step, start_step + config.DAYS_TO_BE_SCRAPED):
            date_list.append({
                "date": datetime.now(tz) + timedelta(days=step),
                "status": "none",
                "error_count": 0
            })

        stop_date_str = ""
        start_date_str = currentdate.strftime('%Y-%m-%d %H:%M')
        print "************************************"
        print len(date_list), departure, arrival
        print "************************************"

        while len(date_list) > 0:
            if len(threads) < threads_number:
                start_date = None
                phantom_obj = None
                if no_result_info["Count"] > config.MAX_NO_RESULT_COUNT:
                    print("--------------------------")
                    print("No result any more")
                    print("--------------------------")
                    break
                # print "+++++++++++++++++++++++++++++++++"
                # print driver_list
                # print "+++++++++++++++++++++++++++++++++"
                for date in date_list:
                    if date["status"] == "complete":
                        date_list.remove(date)
                    elif date["status"] == "none":
                        start_date = date
                        start_date["status"] = "pending"
                        break
                if len(date_list) == 0:
                    break
                if start_date == None:
                    continue
                for driver in driver_list:
                    if driver["status"] == "none":
                        phantom_obj = driver
                        driver["status"] = "pending"
                        break
                if phantom_obj == None:
                    continue

                print("++++++++++++++++++++++++++++++")
                print("Depart List = " + str(len(depart_arrival_list)) + " Index =" + str(i))
                # print (depart_arrival_info)
                print(departure_abbr + "," + arrival_abbr)
                print(start_date)
                # print driver_list
                print("++++++++++++++++++++++++++++++")

                stop_date_str = start_date["date"].strftime('%Y-%m-%d %H:%M')
                sleep(config.DRIVER_SHORT_WAITING_SECONDS)
                proxy_ip, proxy_port, proxy_user, proxy_pass = random_proxy()
                if proxy_user != None:
                    auth_str = "{}:{}".format(proxy_user, proxy_pass)
                    proxy = Proxy(proxy_ip, proxy_port, auth_str)
                else:
                    proxy = Proxy(proxy_ip, proxy_port)

                class_obj = None
                class_obj = TropicAir(phantom_obj, start_date, departure, arrival,
                                      currentdate, tz, departure_abbr, arrival_abbr,
                                      no_result_info, filename)
                thread_obj = threading.Thread(
                    target=class_obj.parse_website,
                    args=(config.DRIVER_VALUE_PHANTOMJS, ))
                threads.append(thread_obj)
                thread_obj.start()

            for thread in threads:
                if not thread.is_alive():
                    thread.join()
                    threads.remove(thread)

        print("*************************")
        print(len(date_list))
        print(no_result_info)
        finishdate = datetime.now(tz)
        finish_date_str = finishdate.strftime('%Y-%m-%d %H:%M')
        global_sc_obj.save([
            "Departure", departure,
            "Arrival", arrival,
            "Date Len", len(date_list),
            "No Result", no_result_info["Count"],
            "File Name", filename,
            "Start Date", start_date_str,
            "Finish", stop_date_str,
            "Capture Date", finish_date_str,
        ], "output/output_{}.csv".format(website_type))
        print("*************************")

        try:
            common_lib.upload_file(filename, "output/")
            print "Upload", departure, arrival
        except:
            print("Error while upload :" + filename)

def threaded_get(url=None, urls=None, url_iter=None, num_threads=10, dl=None, cb=None, depth=True, **kwargs):
    """Download these urls in parallel

    url: the webpage to download
    urls: the webpages to download
    num_threads: the number of threads to download urls with
    cb: Called after each download with the HTML of the download.
        The arguments are the url and downloaded html.
        Whatever URLs are returned are added to the crawl queue.
    dl: A callback for customizing the download.
        Takes the download object and url and should return the HTML.
    depth: True for depth first search
    """
    running = True
    lock = threading.Lock()

    def add_iter_urls():
        if lock.acquire(False):
            for url in url_iter or []:
                download_queue.append(url)
                break
            lock.release()

    def process_queue():
        """Thread for downloading webpages
        """
        D = Download(**kwargs)

        while True:
            try:
                url = download_queue.pop() if depth else download_queue.popleft()
            except IndexError:
                add_iter_urls()
                break
            else:
                # download this url
                html = dl(D, url, **kwargs) if dl else D.get(url, **kwargs)
                if cb:
                    try:
                        # use callback to process downloaded HTML
                        result = cb(D, url, html)
                    except StopCrawl:
                        common.logger.info('Stopping crawl signal')
                        running = False
                    except Exception:
                        # catch any callback error to avoid losing thread
                        common.logger.exception('\nIn callback for: ' + str(url))
                    else:
                        # add these URL's to crawl queue
                        for link in result or []:
                            download_queue.append(urlparse.urljoin(url, link))

            # update the crawler state
            # no download or error so must have read from cache
            num_caches = 0 if D.num_downloads or D.num_errors else 1
            state.update(num_downloads=D.num_downloads, num_errors=D.num_errors,
                         num_caches=num_caches, queue_size=len(download_queue))

    download_queue = collections.deque()
    if urls:
        download_queue.extend(urls)
    if url:
        download_queue.append(url)
    common.logger.debug('Start new crawl')

    # initiate the state file with the number of URL's already in the queue
    state = State()
    state.update(queue_size=len(download_queue))

    # wait for all download threads to finish
    threads = []
    while running and (threads or download_queue):
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < num_threads and download_queue:
            # can start more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # set daemon so main thread can exit when receives ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
    # save the final state after threads finish
    state.save()

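# A stripped-down sketch of the pool-management loop that threaded_get uses:
# prune finished workers with is_alive(), top the pool back up to num_threads
# while work remains, and sleep between passes. run_pool, work, handle and the
# SLEEP_TIME value are illustrative stand-ins, not part of the original code.
import collections
import threading
import time

SLEEP_TIME = 0.1

def run_pool(items, handle, num_threads=10):
    work = collections.deque(items)

    def worker():
        while True:
            try:
                item = work.pop()      # deque.pop() is atomic, safe across threads
            except IndexError:
                break                  # queue drained, let this worker exit
            handle(item)

    threads = []
    while threads or work:
        threads = [t for t in threads if t.is_alive()]   # prune finished workers
        while len(threads) < num_threads and work:
            t = threading.Thread(target=worker)
            t.setDaemon(True)          # daemon so Ctrl-C still exits the main thread
            t.start()
            threads.append(t)
        time.sleep(SLEEP_TIME)
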
def threaded_get(url=None, urls=None, num_threads=10, dl=None, cb=None, depth=None,
                 wait_finish=True, reuse_queue=False, max_queue=1000, **kwargs):
    """Download these urls in parallel

    url: the webpage to download
    urls: the webpages to download
    num_threads: the number of threads to download urls with
    cb: Called after each download with the HTML of the download.
        The arguments are the url and downloaded html.
        Whatever URLs are returned are added to the crawl queue.
    dl: A callback for customizing the download.
        Takes the download object and url and should return the HTML.
    depth: Deprecated - will be removed in later version
    wait_finish: whether to wait until all download threads have finished before returning
    reuse_queue: Whether to continue the queue from the previous run.
    max_queue: The maximum number of queued URLs to keep in memory.
        The rest will be in the cache.
    """
    if kwargs.pop('cache', None):
        common.logger.debug('threaded_get does not support cache flag')
    lock = threading.Lock()

    class DownloadThread(threading.Thread):
        """Thread for downloading webpages
        """
        processing = collections.deque()  # to track whether are still downloading
        discovered = {}  # the URL's that have been discovered

        def __init__(self):
            threading.Thread.__init__(self)

        def run(self):
            D = Download(**kwargs)
            queue = pdict.Queue(settings.queue_file)

            while seed_urls or DownloadThread.processing:
                # keep track that are processing url
                DownloadThread.processing.append(1)
                try:
                    url = seed_urls.pop()
                except IndexError:
                    # currently no urls to process
                    DownloadThread.processing.popleft()
                    # so check again later
                    time.sleep(SLEEP_TIME)
                else:
                    try:
                        # download this url
                        html = dl(D, url, **kwargs) if dl else D.get(url, **kwargs)
                        if cb:
                            try:
                                # use callback to process downloaded HTML
                                result = cb(D, url, html)
                            except Exception, e:
                                # catch any callback error to avoid losing thread
                                common.logger.exception('\nIn callback for: ' + str(url))
                            else:
                                # add these URL's to crawl queue
                                for link in result or []:
                                    cb_url = urlparse.urljoin(url, link)
                                    if isinstance(result, dict):
                                        DownloadThread.discovered[cb_url] = result[link]
                                    else:
                                        DownloadThread.discovered[cb_url] = DEFAULT_PRIORITY

                                if len(seed_urls) < max_queue:
                                    # need to request more queue
                                    if DownloadThread.discovered or len(queue) > 0:
                                        # there are outstanding in the queue
                                        if lock.acquire(False):
                                            # no other thread is downloading
                                            common.logger.debug('Loading from queue: %d' % len(seed_urls))
                                            discovered = []
                                            while DownloadThread.discovered:
                                                discovered.append(DownloadThread.discovered.popitem())
                                            queue.push(discovered)
                                            # get next batch of URLs from cache
                                            seed_urls.extend(queue.pull(limit=max_queue))
                                            lock.release()
                    finally:
                        # have finished processing
                        # make sure this is called even on exception to avoid eternal loop
                        DownloadThread.processing.pop()

                    # update the crawler state
                    # no download or error so must have read from cache
                    num_caches = 0 if D.num_downloads or D.num_errors else 1
                    state.update(num_downloads=D.num_downloads, num_errors=D.num_errors,
                                 num_caches=num_caches, queue_size=len(queue))

    queue = pdict.Queue(settings.queue_file)
    if reuse_queue:
        # command line flag to enable queue
        queued_urls = queue.pull(limit=max_queue)
    else:
        queued_urls = []
    if queued_urls:
        # continue the previous crawl
        seed_urls = collections.deque(queued_urls)
        common.logger.debug('Loading crawl queue')
    else:
        # remove any queued URL's so can crawl again
        queue.clear()
        urls = urls or []
        if url:
            urls.append(url)
        queue.push([(url, DEFAULT_PRIORITY) for url in urls])
        # put urls into thread safe queue
        seed_urls = collections.deque(queue.pull(limit=max_queue))
    common.logger.debug('Start new crawl')

    # initiate the state file with the number of URL's already in the queue
    state = State()
    state.update(queue_size=len(queue))

    # start the download threads
    threads = [DownloadThread() for i in range(num_threads)]
    for thread in threads:
        thread.setDaemon(True)  # set daemon so main thread can exit when receives ctrl-c
        thread.start()

    # Wait for all download threads to finish
    while threads and wait_finish:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        time.sleep(SLEEP_TIME)
    # save the final state after threads finish
    state.save()

def threaded_get(url=None, urls=None, url_iter=None, num_threads=10, dl=None, cb=None, depth=True, **kwargs):
    """Download these urls in parallel

    url: the webpage to download
    urls: the webpages to download
    num_threads: the number of threads to download urls with
    cb: Called after each download with the HTML of the download.
        The arguments are the url and downloaded html.
        Whatever URLs are returned are added to the crawl queue.
    dl: A callback for customizing the download.
        Takes the download object and url and should return the HTML.
    depth: True for depth first search
    """
    running = True
    lock = threading.Lock()

    def add_iter_urls():
        if lock.acquire(False):
            for url in url_iter or []:
                download_queue.append(url)
                break
            lock.release()

    def process_queue():
        """Thread for downloading webpages
        """
        D = Download(**kwargs)

        while True:
            try:
                url = download_queue.pop() if depth else download_queue.popleft()
            except IndexError:
                add_iter_urls()
                break
            else:
                # download this url
                html = dl(D, url, **kwargs) if dl else D.get(url, **kwargs)
                if cb:
                    try:
                        # use callback to process downloaded HTML
                        result = cb(D, url, html)
                    except StopCrawl:
                        common.logger.info('Stopping crawl signal')
                        running = False
                    except Exception:
                        # catch any callback error to avoid losing thread
                        common.logger.exception('\nIn callback for: ' + str(url))
                    else:
                        # add these URL's to crawl queue
                        for link in result or []:
                            download_queue.append(link)

            # update the crawler state
            # no download or error so must have read from cache
            num_caches = 0 if D.num_downloads or D.num_errors else 1
            state.update(num_downloads=D.num_downloads, num_errors=D.num_errors,
                         num_caches=num_caches, queue_size=len(download_queue))

    download_queue = collections.deque()
    if urls:
        download_queue.extend(urls)
    if url:
        download_queue.append(url)
    common.logger.debug('Start new crawl')

    # initiate the state file with the number of URL's already in the queue
    state = State()
    state.update(queue_size=len(download_queue))

    # wait for all download threads to finish
    threads = []
    while running and (threads or download_queue):
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < num_threads and download_queue:
            # can start more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # set daemon so main thread can exit when receives ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
    # save the final state after threads finish
    state.save()

def threaded_get(url=None, urls=None, num_threads=10, dl=None, cb=None, depth=None,
                 wait_finish=True, reuse_queue=False, max_queue=1000, **kwargs):
    """Download these urls in parallel

    url: the webpage to download
    urls: the webpages to download
    num_threads: the number of threads to download urls with
    cb: Called after each download with the HTML of the download.
        The arguments are the url and downloaded html.
        Whatever URLs are returned are added to the crawl queue.
    dl: A callback for customizing the download.
        Takes the download object and url and should return the HTML.
    depth: Deprecated - will be removed in later version
    wait_finish: whether to wait until all download threads have finished before returning
    reuse_queue: Whether to continue the queue from the previous run.
    max_queue: The maximum number of queued URLs to keep in memory.
        The rest will be in the cache.
    """
    if kwargs.pop('cache', None):
        common.logger.debug('threaded_get does not support cache flag')
    lock = threading.Lock()

    class DownloadThread(threading.Thread):
        """Thread for downloading webpages
        """
        processing = collections.deque()  # to track whether are still downloading
        discovered = {}  # the URL's that have been discovered

        def __init__(self):
            threading.Thread.__init__(self)

        def run(self):
            D = Download(**kwargs)
            queue = pdict.Queue(settings.queue_file)

            while seed_urls or DownloadThread.processing:
                # keep track that are processing url
                DownloadThread.processing.append(1)
                try:
                    url = seed_urls.pop()
                except IndexError:
                    # currently no urls to process
                    DownloadThread.processing.popleft()
                    # so check again later
                    time.sleep(SLEEP_TIME)
                else:
                    try:
                        # download this url
                        html = dl(D, url, **kwargs) if dl else D.get(url, **kwargs)
                        if cb:
                            try:
                                # use callback to process downloaded HTML
                                cb_urls = cb(D, url, html)
                            except Exception, e:
                                # catch any callback error to avoid losing thread
                                common.logger.error('in callback for: ' + str(url) + '\n' + traceback.format_exc())
                            else:
                                # add these URL's to crawl queue
                                for cb_url in cb_urls or []:
                                    if isinstance(cb_urls, dict):
                                        DownloadThread.discovered[cb_url] = cb_urls[cb_url]
                                    else:
                                        DownloadThread.discovered[cb_url] = DEFAULT_PRIORITY

                                if len(seed_urls) < max_queue:
                                    # need to request more queue
                                    if DownloadThread.discovered or len(queue) > 0:
                                        # there are outstanding in the queue
                                        if lock.acquire(False):
                                            # no other thread is downloading
                                            common.logger.debug('Loading from queue: %d' % len(seed_urls))
                                            discovered = []
                                            while DownloadThread.discovered:
                                                discovered.append(DownloadThread.discovered.popitem())
                                            queue.push(discovered)
                                            # get next batch of URLs from cache
                                            seed_urls.extend(queue.pull(limit=max_queue))
                                            lock.release()
                                """
                                for cb_url in cb_urls or []:
                                    if cb_url not in DownloadThread.discovered:
                                        DownloadThread.discovered[cb_url] = 1
                                        seed_urls.append(cb_url)
                                """
                    finally:
                        # have finished processing
                        # make sure this is called even on exception to avoid eternal loop
                        DownloadThread.processing.pop()

                    # update the crawler state
                    # no download or error so must have read from cache
                    num_caches = 0 if D.num_downloads or D.num_errors else 1
                    state.update(num_downloads=D.num_downloads, num_errors=D.num_errors,
                                 num_caches=num_caches, queue_size=len(queue))

    queue = pdict.Queue(settings.queue_file)
    if reuse_queue:
        # command line flag to enable queue
        queued_urls = queue.pull(limit=max_queue)
    else:
        queued_urls = []
    if queued_urls:
        # continue the previous crawl
        seed_urls = collections.deque(queued_urls)
        common.logger.debug('Loading crawl queue')
    else:
        # remove any queued URL's so can crawl again
        queue.clear()
        urls = urls or []
        if url:
            urls.append(url)
        queue.push([(url, DEFAULT_PRIORITY) for url in urls])
        # put urls into thread safe queue
        seed_urls = collections.deque(queue.pull(limit=max_queue))
    common.logger.debug('Start new crawl')

    # initiate the state file with the number of URL's already in the queue
    state = State()
    state.update(queue_size=len(queue))

    # start the download threads
    threads = [DownloadThread() for i in range(num_threads)]
    for thread in threads:
        thread.setDaemon(True)  # set daemon so main thread can exit when receives ctrl-c
        thread.start()

    # Wait for all download threads to finish
    while threads and wait_finish:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        time.sleep(SLEEP_TIME)
    # save the final state after threads finish
    state.save()