def qumulo_treewalk(path, ip, ses, q_crawl, num_sep, level, batchsize, cliargs,
                    logger, reindex_dict):
    """Walk a Qumulo filesystem tree over its API and enqueue directory
    batches onto the redis rq crawl queue.

    Spawns cliargs['walkthreads'] daemon threads running apiwalk_worker,
    consumes (root, dirs, files) tuples from qumulo_api_walk, batches
    non-excluded directories and enqueues scrape_tree_meta jobs, updating
    a progress bar unless running quiet/debug/verbose.

    Args:
        path: root path to start the tree walk from
        ip/ses: Qumulo cluster address and API session
        q_crawl: rq queue that scrape_tree_meta jobs are enqueued to
        num_sep: path-separator count of the root path (for maxdepth)
        level: maximum depth below the root to descend
        batchsize: initial number of directories per enqueued batch
        cliargs: dict of command line args
        logger: logger instance
        reindex_dict: reindex data passed through to scrape_tree_meta
    """
    batch = []
    dircount = 0
    totaldirs = 0
    totalfiles = 0
    starttime = time.time()

    # queues for paths handed to / returned from the api walk threads
    q_paths = PyQueue()
    q_paths_results = PyQueue()
    lock = Lock()

    # set up threads for tree walk
    for i in range(cliargs['walkthreads']):
        t = Thread(target=apiwalk_worker,
                   args=(ip, ses, q_paths, q_paths_results, lock,))
        t.daemon = True
        t.start()

    # set up progress bar
    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        widgets = [progressbar.AnimatedMarker(), ' Crawling (Queue: ',
                   progressbar.Counter(), progressbar.FormatLabel(''), ') ',
                   progressbar.Timer()]
        bar = progressbar.ProgressBar(widgets=widgets,
                                      max_value=progressbar.UnknownLength)
        bar.start()
    else:
        bar = None
    bartimestamp = time.time()

    for root, dirs, files in qumulo_api_walk(path, ip, ses, q_paths,
                                             q_paths_results):
        dircount += 1
        totaldirs += 1
        files_len = len(files)
        dirs_len = len(dirs)
        totalfiles += files_len
        # skip empty dirs unless indexing them was requested
        if dirs_len == 0 and files_len == 0 and not cliargs['indexemptydirs']:
            continue
        # strip trailing path separator except for the filesystem root
        if root['path'] != '/':
            root_path = root['path'].rstrip(os.path.sep)
        else:
            root_path = root['path']
        if not dir_excluded(root_path, config, cliargs):
            batch.append((root, dirs, files))
            batch_len = len(batch)
            if batch_len >= batchsize or (
                    cliargs['adaptivebatch'] and
                    totalfiles >= config['adaptivebatch_maxfiles']):
                q_crawl.enqueue(scrape_tree_meta,
                                args=(batch, cliargs, reindex_dict,),
                                result_ttl=config['redis_ttl'])
                if cliargs['debug'] or cliargs['verbose']:
                    logger.info("enqueued batchsize: %s (batchsize: %s)"
                                % (batch_len, batchsize))
                del batch[:]
                # FIX: reset the file counter after each enqueue so the
                # adaptivebatch_maxfiles threshold applies per batch,
                # matching socket_thread_handler_twc; previously the
                # counter was cumulative, so once it crossed the
                # threshold every directory was enqueued as a batch of 1
                totalfiles = 0
                if cliargs['adaptivebatch']:
                    batchsize = adaptive_batch(q_crawl, cliargs, batchsize)
                    if cliargs['debug'] or cliargs['verbose']:
                        logger.info("batchsize set to: %s" % batchsize)

            # check if at maxdepth level and delete dirs/files lists to not
            # descend further down the tree
            if cliargs['maxdepth']:
                num_sep_this = root_path.count(os.path.sep)
                if num_sep + level <= num_sep_this:
                    del dirs[:]
                    del files[:]
        else:  # directory excluded
            del dirs[:]
            del files[:]

        # update progress bar (refresh dirs/sec label every 2 seconds)
        if bar:
            try:
                if time.time() - bartimestamp >= 2:
                    elapsed = round(time.time() - bartimestamp, 3)
                    dirspersec = round(dircount / elapsed, 3)
                    widgets[4] = progressbar.FormatLabel(
                        ', ' + str(dirspersec) + ' dirs/sec) ')
                    bartimestamp = time.time()
                    dircount = 0
                bar.update(len(q_crawl))
            except (ZeroDivisionError, ValueError):
                bar.update(0)

    # add any remaining in batch to queue
    # FIX: guard against enqueuing an empty batch, consistent with the
    # other enqueue sites in this module
    if len(batch) > 0:
        q_crawl.enqueue(scrape_tree_meta,
                        args=(batch, cliargs, reindex_dict,),
                        result_ttl=config['redis_ttl'])

    # set up progress bar with time remaining
    if bar:
        bar.finish()
        bar_max_val = len(q_crawl)
        bar = progressbar.ProgressBar(max_value=bar_max_val)
        bar.start()
    else:
        bar = None

    # update progress bar until bots are idle and queue is empty
    while worker_bots_busy([q_crawl]):
        if bar:
            q_len = len(q_crawl)
            try:
                bar.update(bar_max_val - q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        time.sleep(1)

    if bar:
        bar.finish()

    elapsed = round(time.time() - starttime, 3)
    dirspersec = round(totaldirs / elapsed, 3)
    logger.info("Finished crawling, elapsed time %s sec, dirs walked %s (%s dirs/sec)"
                % (elapsed, totaldirs, dirspersec))
def socket_thread_handler_twc(threadnum, q, q_kill, lock, rootdir, num_sep,
                              level, batchsize, cliargs, logger, reindex_dict):
    """This is the socket thread handler tree walk client function.
    Stream of directory listings (pickle) from diskover treewalk
    client connections are enqueued to redis rq queue.

    Runs forever: takes (clientsock, addr) connections off q, reads
    framed messages until the client disconnects or sends SIGKILL,
    unpickles each message into (root, dirs, files) tuples and enqueues
    scrape_tree_meta batches to the module-level q_crawl rq queue.

    NOTE(review): the lock, rootdir, num_sep and level parameters are
    not referenced in this body; q_crawl and config appear to be
    module-level globals — confirm against the rest of the file.
    NOTE(review): this function is redefined later in the file with a
    different signature; only the later definition will be bound unless
    one of them is renamed elsewhere — verify.
    """
    while True:
        try:
            # blocking: wait for the next accepted client connection
            c = q.get()
            clientsock, addr = c
            logger.debug(clientsock)
            logger.debug(addr)
            # running file count used for the adaptivebatch_maxfiles
            # threshold; reset to 0 after every enqueue below
            totalfiles = 0
            while True:
                data = recv_one_message(clientsock)
                # empty read means the client closed the connection
                if not data:
                    break
                # client signals end of its tree walk
                if data == b'SIGKILL' or data == 'SIGKILL':
                    q_kill.put(b'SIGKILL')
                    break
                # unpickle data sent from client
                # NOTE(review): pickle.loads on bytes received from a
                # network peer executes arbitrary code if the peer is
                # untrusted — acceptable only on a trusted network
                data_decoded = pickle.loads(data)
                logger.debug(data_decoded)
                # enqueue to redis
                batch = []
                for root, dirs, files in data_decoded:
                    files_len = len(files)
                    totalfiles += files_len
                    # check for empty dirs
                    if len(dirs) == 0 and len(
                            files) == 0 and not cliargs['indexemptydirs']:
                        continue
                    batch.append((root, dirs, files))
                    batch_len = len(batch)
                    # flush when the batch is full, or (adaptive mode)
                    # when enough files have accumulated since last flush
                    if batch_len >= batchsize or (
                            cliargs['adaptivebatch'] and
                            totalfiles >= config['adaptivebatch_maxfiles']):
                        q_crawl.enqueue(scrape_tree_meta, args=(
                            batch, cliargs, reindex_dict,),
                            result_ttl=config['redis_ttl'])
                        if cliargs['debug'] or cliargs['verbose']:
                            logger.info(
                                "enqueued batchsize: %s (batchsize: %s)"
                                % (batch_len, batchsize))
                        del batch[:]
                        totalfiles = 0
                        if cliargs['adaptivebatch']:
                            batchsize = adaptive_batch(q_crawl, cliargs,
                                                       batchsize)
                            if cliargs['debug'] or cliargs['verbose']:
                                logger.info("batchsize set to: %s" % batchsize)
                if len(batch) > 0:
                    # add any remaining in batch to queue
                    q_crawl.enqueue(scrape_tree_meta, args=(
                        batch, cliargs, reindex_dict,),
                        result_ttl=config['redis_ttl'])
                    del batch[:]
            # close connection to client
            clientsock.close()
            logger.info("[thread-%s]: %s closed connection"
                        % (threadnum, str(addr)))
            q.task_done()
        except socket.error as e:
            logger.error("[thread-%s]: Socket error (%s)" % (threadnum, e))
def socket_thread_handler_twc(threadnum, q, q_kill, rootdir, num_sep, level,
                              batchsize, cliargs, logger, reindex_dict):
    """This is the socket thread handler tree walk client function.
    Stream of directory listings (pickle) from diskover treewalk
    client connections are enqueued to redis rq queue.

    Runs forever: takes (clientsock, addr) connections off q, reads
    framed messages until the client disconnects or sends SIGKILL,
    unpickles each message into (root, dirs, files) tuples, filters
    excluded/too-deep directories and enqueues scrape_tree_meta
    batches to the module-level q_crawl rq queue.

    NOTE(review): this redefines an earlier function of the same name
    with a different signature — verify which one callers expect.
    """
    while True:
        try:
            # blocking: wait for the next accepted client connection
            c = q.get()
            clientsock, addr = c
            logger.debug(clientsock)
            logger.debug(addr)
            while True:
                data = recv_one_message(clientsock)
                #logger.debug(data)
                # empty read means the client closed the connection
                if not data:
                    break
                # client signals end of its tree walk
                if data == b'SIGKILL' or data == 'SIGKILL':
                    q_kill.put(b'SIGKILL')
                    break
                # NOTE(review): pickle.loads on bytes from a network peer
                # executes arbitrary code if the peer is untrusted —
                # acceptable only on a trusted network
                data_decoded = pickle.loads(data)
                logger.debug(data_decoded)
                # enqueue to redis
                batch = []
                for root, dirs, files in data_decoded:
                    if len(dirs) == 0 and len(
                            files) == 0 and not cliargs['indexemptydirs']:
                        continue
                    # check if meta stat data has been embeded in the data
                    # from client (root is then a (path, stat) tuple)
                    if type(root) is tuple:
                        rootpath = root[0]
                    else:
                        rootpath = root
                    # NOTE(review): the earlier treewalk code passes
                    # cliargs (not cliargs['verbose']) as the third arg
                    # to dir_excluded — confirm its signature
                    if not dir_excluded(rootpath, config, cliargs['verbose']):
                        batch.append((root, dirs, files))
                        batch_len = len(batch)
                        if batch_len >= batchsize:
                            # FIX: pass result_ttl for consistency with
                            # every other enqueue site in this module
                            q_crawl.enqueue(scrape_tree_meta, args=(
                                batch, cliargs, reindex_dict,),
                                result_ttl=config['redis_ttl'])
                            del batch[:]
                            if cliargs['adaptivebatch']:
                                batchsize = adaptive_batch(
                                    q_crawl, cliargs, batchsize)
                        # check if at maxdepth level and delete dirs/files
                        # lists to not descend further down the tree
                        num_sep_this = rootpath.count(os.path.sep)
                        if num_sep + level <= num_sep_this:
                            del dirs[:]
                            del files[:]
                    else:  # directory excluded
                        del dirs[:]
                        del files[:]
                if len(batch) > 0:
                    # add any remaining in batch to queue
                    # FIX: result_ttl added here as well (see above)
                    q_crawl.enqueue(scrape_tree_meta, args=(
                        batch, cliargs, reindex_dict,),
                        result_ttl=config['redis_ttl'])
                    del batch[:]
            # close connection to client
            clientsock.close()
            logger.info("[thread-%s]: %s closed connection"
                        % (threadnum, str(addr)))
            q.task_done()
        except socket.error as e:
            logger.error("[thread-%s]: Socket error (%s)" % (threadnum, e))
def dupes_finder(es, q, cliargs, logger):
    """This is the duplicate file finder function.
    It searches Elasticsearch for files that have the same filehashes
    and adds file hash groups to Queue.
    """
    logger.info('Searching %s for all dupe filehashes...', cliargs['index'])

    # size-range restriction shared by both query shapes
    size_range = {
        "range": {
            "filesize": {
                "lte": config['dupes_maxsize'],
                "gte": cliargs['minsize']
            }
        }
    }
    source_fields = ['filename', 'filehash', 'path_parent',
                     'last_modified', 'last_access']
    # first get all the filehashes with files that have a hardlinks count
    # of 1 (unless hardlinks are explicitly included)
    if cliargs['inchardlinks']:
        data = {
            "size": 0,
            "_source": source_fields,
            "query": {
                "bool": {
                    "must": size_range
                }
            }
        }
    else:
        data = {
            "size": 0,
            "_source": source_fields,
            "query": {
                "bool": {
                    "must": {"term": {"hardlinks": 1}},
                    "filter": size_range
                }
            }
        }

    # refresh index
    es.indices.refresh(index=cliargs['index'])
    # search es and start scroll
    res = es.search(index=cliargs['index'], doc_type='file', scroll='1m',
                    size=config['es_scrollsize'], body=data,
                    request_timeout=config['es_timeout'])

    # group file docs by their filehash
    filehashes = {}
    while res['hits']['hits'] and len(res['hits']['hits']) > 0:
        for hit in res['hits']['hits']:
            doc = hit['_source']
            record = {
                'id': hit['_id'],
                'filename': os.path.join(doc['path_parent'], doc['filename']),
                'atime': doc['last_access'],
                'mtime': doc['last_modified'],
                'md5': ''
            }
            filehashes.setdefault(doc['filehash'], []).append(record)
        # use es scroll api
        res = es.scroll(scroll_id=res['_scroll_id'], scroll='1m',
                        request_timeout=config['es_timeout'])

    # keep only hashes shared by two or more files
    filehashes = {fh: grp for fh, grp in filehashes.items() if len(grp) >= 2}
    possibledupescount = sum(len(grp) for grp in filehashes.values())

    logger.info('Found %s possible dupe files', possibledupescount)
    if possibledupescount == 0:
        return

    logger.info('Starting to enqueue dupe file hashes...')

    batchsize = ab_start if cliargs['adaptivebatch'] else cliargs['batchsize']
    if cliargs['verbose'] or cliargs['debug']:
        logger.info('Batch size: %s' % batchsize)

    n = 0
    hashgroups = []
    for fh, grp in filehashes.items():
        if cliargs['verbose'] or cliargs['debug']:
            logger.info('filehash: %s, filecount: %s' % (fh, len(grp)))
        hashgroups.append({'filehash': fh, 'files': grp})
        n += 1
        if n >= batchsize:
            # send to rq for bots to process hashgroups list
            q.enqueue(dupes_process_hashkeys, args=(hashgroups, cliargs,),
                      result_ttl=config['redis_ttl'])
            if cliargs['debug'] or cliargs['verbose']:
                logger.info("enqueued batchsize: %s (batchsize: %s)"
                            % (n, batchsize))
            del hashgroups[:]
            n = 0
            if cliargs['adaptivebatch']:
                batchsize = adaptive_batch(q, cliargs, batchsize)
                if cliargs['debug'] or cliargs['verbose']:
                    logger.info("batchsize set to: %s" % batchsize)

    # enqueue dir calc job for any remaining in dirlist
    if n > 0:
        q.enqueue(dupes_process_hashkeys, args=(hashgroups, cliargs,),
                  result_ttl=config['redis_ttl'])

    logger.info(
        '%s possible dupe file hashes have been enqueued, worker bots processing dupes...'
        % possibledupescount)

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar = progress_bar('Checking')
        bar.start()
    else:
        bar = None

    # update progress bar until bots are idle and queue is empty
    while worker_bots_busy([q]):
        if bar:
            try:
                bar.update(len(q))
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        time.sleep(1)

    if bar:
        bar.finish()
def dupes_finder(es, q, cliargs, logger):
    """This is the duplicate file finder function.
    It searches Elasticsearch for files that have the same filehashes
    and adds file hash groups to Queue.

    NOTE(review): this redefines an earlier dupes_finder in the file;
    only this later binding survives import — verify which is intended.
    """
    logger.info('Searching %s for all duplicate files...', cliargs['index'])

    batchsize = ab_start if cliargs['adaptivebatch'] else cliargs['batchsize']
    if cliargs['verbose'] or cliargs['debug']:
        logger.info('Batch size: %s' % batchsize)

    # first get all the filehashes with files that have a hardlinks count of 1
    data = {
        "size": 0,
        "query": {
            "bool": {
                "must": {
                    "term": {"hardlinks": 1}
                },
                "filter": {
                    "range": {
                        "filesize": {
                            "lte": config['dupes_maxsize'],
                            "gte": cliargs['minsize']
                        }
                    }
                }
            }
        }
    }

    # refresh index
    es.indices.refresh(index=cliargs['index'])
    # search es and start scroll
    res = es.search(index=cliargs['index'], scroll='1m', doc_type='file',
                    size=config['es_scrollsize'], body=data,
                    request_timeout=config['es_timeout'])

    pending = []        # current batch of filehash keys awaiting enqueue
    filehashcount = 0   # total hashes enqueued across all batches
    while res['hits']['hits'] and len(res['hits']['hits']) > 0:
        for hit in res['hits']['hits']:
            fh = hit['_source']['filehash']
            # NOTE(review): the list is cleared after every enqueue, so
            # this dedupe is only within the current batch — confirm
            # whether global dedupe was intended
            if fh in pending:
                continue
            pending.append(fh)
            filehashcount += 1
            pending_len = len(pending)
            if pending_len >= batchsize:
                # send to rq for bots to process file hashkey list
                q.enqueue(dupes_process_hashkey, args=(pending, cliargs,),
                          result_ttl=config['redis_ttl'])
                if cliargs['debug'] or cliargs['verbose']:
                    logger.info("enqueued batchsize: %s (batchsize: %s)"
                                % (pending_len, batchsize))
                del pending[:]
                if cliargs['adaptivebatch']:
                    batchsize = adaptive_batch(q, cliargs, batchsize)
                    if cliargs['debug'] or cliargs['verbose']:
                        logger.info("batchsize set to: %s" % batchsize)
        # use es scroll api
        res = es.scroll(scroll_id=res['_scroll_id'], scroll='1m',
                        request_timeout=config['es_timeout'])

    # enqueue dir calc job for any remaining in dirlist
    if len(pending) > 0:
        q.enqueue(dupes_process_hashkey, args=(pending, cliargs,),
                  result_ttl=config['redis_ttl'])

    logger.info('%s file hashes have been enqueued' % filehashcount)

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar = progress_bar('Checking')
        bar.start()
    else:
        bar = None

    # update progress bar until bots are idle and queue is empty
    while worker_bots_busy([q]):
        if bar:
            try:
                bar.update(len(q))
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        time.sleep(1)

    if bar:
        bar.finish()