def qumulo_treewalk(path, lock, ip, ses, num_sep, level, totaljobs, batchsize, cliargs, logger, reindex_dict):
    """Walk a Qumulo filesystem tree starting at path using the Qumulo API
    and enqueue batches of (root, files) tuples onto the Redis rq queue
    (diskover.q) for worker bots to scrape tree meta data.

    Parameters:
        path: top directory to start the walk from
        lock: multiprocessing lock guarding the shared totaljobs counter
        ip, ses: Qumulo cluster ip and api session passed to qumulo_api_walk
        num_sep: path separator count of the root dir (maxdepth baseline)
        level: maxdepth level relative to num_sep
        totaljobs: shared counter (has a .value attribute) of enqueued jobs
        batchsize: initial number of directories per enqueued batch
        cliargs: dict of cli arguments
        logger: logger for batch size messages
        reindex_dict: reindex data passed through to scrape_tree_meta
    """
    batch = []
    for root, dirs, files in qumulo_api_walk(path, ip, ses):
        # root appears to be a dict with a 'path' key; strip the trailing
        # separator except for the filesystem root itself
        if root['path'] != '/':
            root_path = root['path'].rstrip(os.path.sep)
        else:
            root_path = root['path']
        if not diskover.dir_excluded(root_path, diskover.config, cliargs['verbose']):
            # skip empty dirs unless indexing of empty dirs was requested
            if len(dirs) == 0 and len(files) == 0 and not cliargs['indexemptydirs']:
                continue
            batch.append((root, files))
            if len(batch) >= batchsize:
                diskover.q.enqueue(diskover_worker_bot.scrape_tree_meta,
                                   args=(batch, cliargs, reindex_dict,))
                with lock:
                    totaljobs.value += 1
                del batch[:]
                batchsize_prev = batchsize
                if cliargs['adaptivebatch']:
                    # shrink batches when the queue is drained (bots keeping
                    # up), grow them while the queue is backed up
                    if len(diskover.q) == 0:
                        if (batchsize - 10) >= diskover.adaptivebatch_startsize:
                            batchsize = batchsize - 10
                    elif len(diskover.q) > 0:
                        if (batchsize + 10) <= diskover.adaptivebatch_maxsize:
                            batchsize = batchsize + 10
                    cliargs['batchsize'] = batchsize
                    if cliargs['verbose'] or cliargs['debug']:
                        if batchsize_prev != batchsize:
                            logger.info('Batch size: %s' % batchsize)
            # check if at maxdepth level and delete dirs/files lists to not
            # descend further down the tree
            num_sep_this = root_path.count(os.path.sep)
            if num_sep + level <= num_sep_this:
                del dirs[:]
                del files[:]
        else:  # directory excluded
            del dirs[:]
            del files[:]
    # add any remaining in batch to queue; guard against an empty batch so
    # we don't enqueue a no-op job and inflate the totaljobs counter
    if len(batch) > 0:
        diskover.q.enqueue(diskover_worker_bot.scrape_tree_meta,
                           args=(batch, cliargs, reindex_dict,))
        with lock:
            totaljobs.value += 1
def qumulo_treewalk(path, ip, ses, num_sep, level, batchsize, bar, cliargs, reindex_dict):
    """Walk a Qumulo filesystem tree starting at path using the Qumulo API
    and enqueue batches of (root, files) tuples onto the Redis rq queue
    (diskover.q) for worker bots to scrape tree meta data. Updates the
    progress bar with the percent of completed jobs.

    Parameters:
        path: top directory to start the walk from
        ip, ses: Qumulo cluster ip and api session passed to qumulo_api_walk
        num_sep: path separator count of the root dir (maxdepth baseline)
        level: maxdepth level relative to num_sep
        batchsize: initial number of directories per enqueued batch
        bar: progressbar instance (used unless quiet/debug/verbose)
        cliargs: dict of cli arguments
        reindex_dict: reindex data passed through to scrape_tree_meta
    """
    batch = []
    for root, dirs, files in qumulo_api_walk(path, ip, ses):
        # root appears to be a dict with a 'path' key; strip the trailing
        # separator except for the filesystem root itself
        if root['path'] != '/':
            root_path = root['path'].rstrip(os.path.sep)
        else:
            root_path = root['path']
        if not diskover.dir_excluded(root_path, diskover.config, cliargs['verbose']):
            # skip empty dirs unless indexing of empty dirs was requested
            if len(dirs) == 0 and len(files) == 0 and not cliargs['indexemptydirs']:
                continue
            batch.append((root, files))
            if len(batch) >= batchsize:
                diskover.q.enqueue(diskover_worker_bot.scrape_tree_meta,
                                   args=(batch, cliargs, reindex_dict,))
                diskover.totaljobs += 1
                del batch[:]
                batchsize_prev = batchsize
                if cliargs['adaptivebatch']:
                    # shrink batches when the queue is drained (bots keeping
                    # up), grow them while the queue is backed up
                    if len(diskover.q) == 0:
                        if (batchsize - 10) >= diskover.adaptivebatch_startsize:
                            batchsize = batchsize - 10
                    elif len(diskover.q) > 0:
                        if (batchsize + 10) <= diskover.adaptivebatch_maxsize:
                            batchsize = batchsize + 10
                    cliargs['batchsize'] = batchsize
                    if cliargs['verbose'] or cliargs['debug']:
                        if batchsize_prev != batchsize:
                            diskover.logger.info('Batch size: %s' % batchsize)
            # check if at maxdepth level and delete dirs/files lists to not
            # descend further down the tree
            num_sep_this = root_path.count(os.path.sep)
            if num_sep + level <= num_sep_this:
                del dirs[:]
                del files[:]
        else:  # directory excluded
            del dirs[:]
            del files[:]
        # update progress bar with percent of jobs completed so far
        if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
            try:
                percent = int("{0:.0f}".format(
                    100 * ((diskover.totaljobs - len(diskover.q)) / float(diskover.totaljobs))))
                bar.update(percent)
            except (ZeroDivisionError, ValueError):
                # totaljobs can still be 0 early in the walk
                bar.update(0)
    # add any remaining in batch to queue; guard against an empty batch so
    # we don't enqueue a no-op job and inflate the totaljobs counter
    if len(batch) > 0:
        diskover.q.enqueue(diskover_worker_bot.scrape_tree_meta,
                           args=(batch, cliargs, reindex_dict,))
        diskover.totaljobs += 1
def qumulo_treewalk(path, ip, ses, q_crawl, num_sep, level, batchsize, cliargs, reindex_dict, bar):
    """Walk a Qumulo filesystem tree starting at path using the Qumulo API
    and enqueue batches of (root, files) tuples onto the q_crawl rq queue
    for worker bots to scrape tree meta data. Updates the progress bar
    with the current queue length.

    Parameters:
        path: top directory to start the walk from
        ip, ses: Qumulo cluster ip and api session passed to qumulo_api_walk
        q_crawl: rq queue to enqueue scrape_tree_meta jobs on
        num_sep: path separator count of the root dir (maxdepth baseline)
        level: maxdepth level relative to num_sep
        batchsize: initial number of directories per enqueued batch
        cliargs: dict of cli arguments
        reindex_dict: reindex data passed through to scrape_tree_meta
        bar: progressbar instance (used unless quiet/debug/verbose)
    """
    batch = []
    for root, dirs, files in qumulo_api_walk(path, ip, ses):
        # skip empty dirs unless indexing of empty dirs was requested
        if len(dirs) == 0 and len(files) == 0 and not cliargs['indexemptydirs']:
            continue
        # root appears to be a dict with a 'path' key; strip the trailing
        # separator except for the filesystem root itself
        if root['path'] != '/':
            root_path = root['path'].rstrip(os.path.sep)
        else:
            root_path = root['path']
        if not diskover.dir_excluded(root_path, diskover.config, cliargs['verbose']):
            batch.append((root, files))
            if len(batch) >= batchsize:
                q_crawl.enqueue(diskover_worker_bot.scrape_tree_meta,
                                args=(batch, cliargs, reindex_dict,))
                del batch[:]
                batchsize_prev = batchsize
                if cliargs['adaptivebatch']:
                    # shrink batches when the queue is drained (bots keeping
                    # up), grow them while the queue is backed up
                    q_len = len(q_crawl)
                    if q_len == 0:
                        if (batchsize - diskover.ab_step) >= diskover.ab_start:
                            batchsize = batchsize - diskover.ab_step
                    elif q_len > 0:
                        if (batchsize + diskover.ab_step) <= diskover.ab_max:
                            batchsize = batchsize + diskover.ab_step
                    cliargs['batchsize'] = batchsize
                    if cliargs['verbose'] or cliargs['debug']:
                        if batchsize_prev != batchsize:
                            diskover.logger.info('Batch size: %s' % batchsize)
            # check if at maxdepth level and delete dirs/files lists to not
            # descend further down the tree
            num_sep_this = root_path.count(os.path.sep)
            if num_sep + level <= num_sep_this:
                del dirs[:]
                del files[:]
        else:  # directory excluded
            del dirs[:]
            del files[:]
        # update progress bar
        if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
            try:
                bar.update(len(q_crawl))
            except (ZeroDivisionError, ValueError):
                bar.update(0)
    # add any remaining in batch to queue; guard against an empty batch so
    # we don't enqueue a no-op job
    if len(batch) > 0:
        q_crawl.enqueue(diskover_worker_bot.scrape_tree_meta,
                        args=(batch, cliargs, reindex_dict,))
def qumulo_api_walk(path, q_paths, q_paths_results, cliargs):
    """Generator driving a threaded Qumulo api tree walk.

    Seeds q_paths with the starting path, then repeatedly pulls completed
    directory listings off q_paths_results, yields each (root, dirs,
    nondirs) entry, and feeds every non-excluded subdirectory back onto
    q_paths for the worker threads to list. Terminates once both queues
    stay empty across a short settle delay.
    """
    q_paths.put(path)
    while True:
        root, dirs, nondirs = q_paths_results.get()
        # yield before recursion so the caller sees parents first
        yield root, dirs, nondirs
        # push subdirectories back for the walker threads to process
        for subdir in dirs:
            subdir_path = os.path.join(root, subdir)
            if not dir_excluded(subdir_path, config, cliargs):
                q_paths.put(subdir_path)
        q_paths_results.task_done()
        # both queues empty could be a momentary lull; re-check after a
        # short sleep before declaring the walk finished
        if q_paths_results.qsize() == 0 and q_paths.qsize() == 0:
            time.sleep(.5)
            if q_paths_results.qsize() == 0 and q_paths.qsize() == 0:
                break
def qumulo_treewalk(path, ip, ses, q_crawl, num_sep, level, batchsize, cliargs, logger, reindex_dict):
    """Walk a Qumulo filesystem tree starting at path using threaded api
    walk workers and enqueue batches of (root, dirs, files) tuples onto
    the q_crawl rq queue for worker bots (scrape_tree_meta) to process.

    Shows a progress bar during the walk and another while waiting for
    the worker bots to drain the queue, then logs crawl statistics.

    Parameters:
        path: top directory to start the walk from
        ip, ses: Qumulo cluster ip and api session for the walk workers
        q_crawl: rq queue to enqueue scrape_tree_meta jobs on
        num_sep: path separator count of the root dir (maxdepth baseline)
        level: maxdepth level relative to num_sep
        batchsize: initial number of directories per enqueued batch
        cliargs: dict of cli arguments
        logger: logger for batch/crawl messages
        reindex_dict: reindex data passed through to scrape_tree_meta
    """
    batch = []
    dircount = 0      # dirs walked since the last progress-bar refresh
    totaldirs = 0     # dirs walked overall (for final stats)
    totalfiles = 0    # files seen overall (adaptive batch max-files cap)
    starttime = time.time()
    # queue for paths
    q_paths = PyQueue()
    q_paths_results = PyQueue()
    lock = Lock()
    # set up threads for tree walk
    for i in range(cliargs['walkthreads']):
        t = Thread(target=apiwalk_worker,
                   args=(ip, ses, q_paths, q_paths_results, lock,))
        t.daemon = True
        t.start()
    # set up progress bar (suppressed in quiet/debug/verbose modes)
    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        widgets = [progressbar.AnimatedMarker(), ' Crawling (Queue: ',
                   progressbar.Counter(), progressbar.FormatLabel(''), ') ',
                   progressbar.Timer()]
        bar = progressbar.ProgressBar(widgets=widgets,
                                      max_value=progressbar.UnknownLength)
        bar.start()
    else:
        bar = None
    bartimestamp = time.time()
    for root, dirs, files in qumulo_api_walk(path, ip, ses, q_paths, q_paths_results):
        dircount += 1
        totaldirs += 1
        files_len = len(files)
        dirs_len = len(dirs)
        totalfiles += files_len
        # skip empty dirs unless indexing of empty dirs was requested
        if dirs_len == 0 and files_len == 0 and not cliargs['indexemptydirs']:
            continue
        # root appears to be a dict with a 'path' key; strip the trailing
        # separator except for the filesystem root itself
        if root['path'] != '/':
            root_path = root['path'].rstrip(os.path.sep)
        else:
            root_path = root['path']
        if not dir_excluded(root_path, config, cliargs):
            batch.append((root, dirs, files))
            batch_len = len(batch)
            # flush the batch when full, or (adaptive mode) when the file
            # count cap is hit so huge dirs don't make oversized jobs
            if batch_len >= batchsize or (cliargs['adaptivebatch'] and
                                          totalfiles >= config['adaptivebatch_maxfiles']):
                q_crawl.enqueue(scrape_tree_meta,
                                args=(batch, cliargs, reindex_dict,),
                                result_ttl=config['redis_ttl'])
                if cliargs['debug'] or cliargs['verbose']:
                    logger.info("enqueued batchsize: %s (batchsize: %s)"
                                % (batch_len, batchsize))
                del batch[:]
                if cliargs['adaptivebatch']:
                    batchsize = adaptive_batch(q_crawl, cliargs, batchsize)
                    if cliargs['debug'] or cliargs['verbose']:
                        logger.info("batchsize set to: %s" % batchsize)
            # check if at maxdepth level and delete dirs/files lists to not
            # descend further down the tree
            if cliargs['maxdepth']:
                num_sep_this = root_path.count(os.path.sep)
                if num_sep + level <= num_sep_this:
                    del dirs[:]
                    del files[:]
        else:
            # directory excluded
            del dirs[:]
            del files[:]
        # update progress bar
        if bar:
            try:
                # refresh the dirs/sec label roughly every 2 seconds
                if time.time() - bartimestamp >= 2:
                    elapsed = round(time.time() - bartimestamp, 3)
                    dirspersec = round(dircount / elapsed, 3)
                    # widgets[4] is the ') ' slot; replace it with rate text
                    widgets[4] = progressbar.FormatLabel(
                        ', ' + str(dirspersec) + ' dirs/sec) ')
                    bartimestamp = time.time()
                    dircount = 0
                bar.update(len(q_crawl))
            except (ZeroDivisionError, ValueError):
                bar.update(0)
    # add any remaining in batch to queue
    q_crawl.enqueue(scrape_tree_meta, args=(batch, cliargs, reindex_dict,),
                    result_ttl=config['redis_ttl'])
    # set up progress bar with time remaining
    if bar:
        bar.finish()
        bar_max_val = len(q_crawl)
        bar = progressbar.ProgressBar(max_value=bar_max_val)
        bar.start()
    else:
        bar = None
    # update progress bar until bots are idle and queue is empty
    while worker_bots_busy([q_crawl]):
        if bar:
            q_len = len(q_crawl)
            try:
                bar.update(bar_max_val - q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        time.sleep(1)
    if bar:
        bar.finish()
    # log crawl stats
    elapsed = round(time.time() - starttime, 3)
    dirspersec = round(totaldirs / elapsed, 3)
    logger.info("Finished crawling, elapsed time %s sec, dirs walked %s (%s dirs/sec)"
                % (elapsed, totaldirs, dirspersec))
def socket_thread_handler_twc(threadnum, q, q_kill, rootdir, num_sep, level, batchsize, cliargs, logger, reindex_dict):
    """This is the socket thread handler tree walk client function.
    Stream of directory listings (pickle) from diskover treewalk client
    connections are enqueued to redis rq queue.

    Parameters:
        threadnum: thread number used in log messages
        q: queue of accepted (clientsock, addr) connection tuples
        q_kill: queue used to propagate a SIGKILL message from a client
        rootdir: top directory of the crawl (unused here; kept for the
            shared thread-handler signature)
        num_sep: path separator count of the root dir (maxdepth baseline)
        level: maxdepth level relative to num_sep
        batchsize: initial number of directories per enqueued batch
        cliargs: dict of cli arguments
        logger: logger for connection/debug messages
        reindex_dict: reindex data passed through to scrape_tree_meta
    """
    while True:
        try:
            c = q.get()
            clientsock, addr = c
            logger.debug(clientsock)
            logger.debug(addr)
            while True:
                data = recv_one_message(clientsock)
                #logger.debug(data)
                if not data:
                    break
                if data == b'SIGKILL' or data == 'SIGKILL':
                    q_kill.put(b'SIGKILL')
                    break
                # NOTE(security): pickle.loads on data from a network
                # socket will execute arbitrary code if the peer is not
                # trusted; only treewalk clients should be able to connect
                data_decoded = pickle.loads(data)
                logger.debug(data_decoded)
                # enqueue to redis
                batch = []
                for root, dirs, files in data_decoded:
                    # skip empty dirs unless indexing of empty dirs was requested
                    if len(dirs) == 0 and len(files) == 0 and not cliargs['indexemptydirs']:
                        continue
                    # check if meta stat data has been embeded in the data
                    # from client (root is then a (path, stat) tuple)
                    if isinstance(root, tuple):
                        rootpath = root[0]
                    else:
                        rootpath = root
                    if not dir_excluded(rootpath, config, cliargs['verbose']):
                        batch.append((root, dirs, files))
                        batch_len = len(batch)
                        if batch_len >= batchsize:
                            q_crawl.enqueue(scrape_tree_meta,
                                            args=(batch, cliargs, reindex_dict,))
                            del batch[:]
                            if cliargs['adaptivebatch']:
                                batchsize = adaptive_batch(q_crawl, cliargs, batchsize)
                        # check if at maxdepth level and delete dirs/files
                        # lists to not descend further down the tree
                        num_sep_this = rootpath.count(os.path.sep)
                        if num_sep + level <= num_sep_this:
                            del dirs[:]
                            del files[:]
                    else:
                        # directory excluded
                        del dirs[:]
                        del files[:]
                if len(batch) > 0:
                    # add any remaining in batch to queue
                    q_crawl.enqueue(scrape_tree_meta,
                                    args=(batch, cliargs, reindex_dict,))
                    del batch[:]
            # close connection to client
            clientsock.close()
            logger.info("[thread-%s]: %s closed connection" % (threadnum, str(addr)))
            q.task_done()
        except socket.error as e:
            logger.error("[thread-%s]: Socket error (%s)" % (threadnum, e))