def qumulo_treewalk(path, ip, ses, q_crawl, num_sep, level, batchsize, cliargs, logger, reindex_dict):
    batch = []
    dircount = 0
    totaldirs = 0
    totalfiles = 0
    starttime = time.time()

    # queue for paths
    q_paths = PyQueue()
    q_paths_results = PyQueue()
    lock = Lock()

    # set up threads for tree walk
    for i in range(cliargs['walkthreads']):
        t = Thread(target=apiwalk_worker, args=(ip, ses, q_paths, q_paths_results, lock,))
        t.daemon = True
        t.start()

    # set up progress bar
    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        widgets = [progressbar.AnimatedMarker(), ' Crawling (Queue: ', progressbar.Counter(),
                   progressbar.FormatLabel(''), ') ', progressbar.Timer()]
        bar = progressbar.ProgressBar(widgets=widgets, max_value=progressbar.UnknownLength)
        bar.start()
    else:
        bar = None

    bartimestamp = time.time()
    for root, dirs, files in qumulo_api_walk(path, ip, ses, q_paths, q_paths_results):
        dircount += 1
        totaldirs += 1
        files_len = len(files)
        dirs_len = len(dirs)
        totalfiles += files_len
        if dirs_len == 0 and files_len == 0 and not cliargs['indexemptydirs']:
            continue
        if root['path'] != '/':
            root_path = root['path'].rstrip(os.path.sep)
        else:
            root_path = root['path']
        if not dir_excluded(root_path, config, cliargs):
            batch.append((root, dirs, files))
            batch_len = len(batch)
            if batch_len >= batchsize or (cliargs['adaptivebatch'] and totalfiles >= config['adaptivebatch_maxfiles']):
                q_crawl.enqueue(scrape_tree_meta, args=(batch, cliargs, reindex_dict,),
                                result_ttl=config['redis_ttl'])
                if cliargs['debug'] or cliargs['verbose']:
                    logger.info("enqueued batchsize: %s (batchsize: %s)" % (batch_len, batchsize))
                del batch[:]
                if cliargs['adaptivebatch']:
                    batchsize = adaptive_batch(q_crawl, cliargs, batchsize)
                    if cliargs['debug'] or cliargs['verbose']:
                        logger.info("batchsize set to: %s" % batchsize)

            # check if at maxdepth level and delete dirs/files lists to not
            # descend further down the tree
            if cliargs['maxdepth']:
                num_sep_this = root_path.count(os.path.sep)
                if num_sep + level <= num_sep_this:
                    del dirs[:]
                    del files[:]
        else:
            # directory excluded
            del dirs[:]
            del files[:]

        # update progress bar
        if bar:
            try:
                if time.time() - bartimestamp >= 2:
                    elapsed = round(time.time() - bartimestamp, 3)
                    dirspersec = round(dircount / elapsed, 3)
                    widgets[4] = progressbar.FormatLabel(', ' + str(dirspersec) + ' dirs/sec) ')
                    bartimestamp = time.time()
                    dircount = 0
                bar.update(len(q_crawl))
            except (ZeroDivisionError, ValueError):
                bar.update(0)

    # add any remaining in batch to queue
    q_crawl.enqueue(scrape_tree_meta, args=(batch, cliargs, reindex_dict,), result_ttl=config['redis_ttl'])

    # set up progress bar with time remaining
    if bar:
        bar.finish()
        bar_max_val = len(q_crawl)
        bar = progressbar.ProgressBar(max_value=bar_max_val)
        bar.start()
    else:
        bar = None

    # update progress bar until bots are idle and queue is empty
    while worker_bots_busy([q_crawl]):
        if bar:
            q_len = len(q_crawl)
            try:
                bar.update(bar_max_val - q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        time.sleep(1)

    if bar:
        bar.finish()

    elapsed = round(time.time() - starttime, 3)
    dirspersec = round(totaldirs / elapsed, 3)
    logger.info("Finished crawling, elapsed time %s sec, dirs walked %s (%s dirs/sec)" %
                (elapsed, totaldirs, dirspersec))
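
# --- Illustrative sketch (not part of the original module) ---
# qumulo_treewalk above relies on apiwalk_worker threads pulling directory
# paths off q_paths and pushing listing results onto q_paths_results; neither
# helper is shown here. The minimal, self-contained example below sketches
# that producer/consumer thread pattern using only the standard library.
# All names (list_dir, walk_worker, walk_paths) are hypothetical stand-ins,
# not the real Qumulo API calls.
from queue import Queue
from threading import Thread


def list_dir(path):
    # hypothetical stand-in for a directory-listing API call
    return {'path': path, 'dirs': [], 'files': []}


def walk_worker(q_in, q_out):
    # daemon worker: take a path from the inbound queue, list it, and
    # put the result on the outbound queue
    while True:
        path = q_in.get()
        try:
            q_out.put(list_dir(path))
        finally:
            q_in.task_done()


def walk_paths(paths, num_threads=4):
    q_in, q_out = Queue(), Queue()
    for _ in range(num_threads):
        t = Thread(target=walk_worker, args=(q_in, q_out))
        t.daemon = True
        t.start()
    for p in paths:
        q_in.put(p)
    q_in.join()  # wait until every queued path has been listed
    results = []
    while not q_out.empty():
        results.append(q_out.get())
    return results

# example usage:
# print(walk_paths(['/foo', '/foo/bar']))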
def dupes_finder(es, q, cliargs, logger):
    """This is the duplicate file finder function.
    It searches Elasticsearch for files that have the same filehashes
    and adds file hash groups to Queue.
    """
    logger.info('Searching %s for all duplicate files...', cliargs['index'])

    if cliargs['adaptivebatch']:
        batchsize = ab_start
    else:
        batchsize = cliargs['batchsize']
    if cliargs['verbose'] or cliargs['debug']:
        logger.info('Batch size: %s' % batchsize)

    # first get all the filehashes with files that have a hardlinks count of 1
    data = {
        "size": 0,
        "query": {
            "bool": {
                "must": {
                    "term": {"hardlinks": 1}
                },
                "filter": {
                    "range": {
                        "filesize": {
                            "lte": config['dupes_maxsize'],
                            "gte": cliargs['minsize']
                        }
                    }
                }
            }
        }
    }

    # refresh index
    es.indices.refresh(index=cliargs['index'])
    # search es and start scroll
    res = es.search(index=cliargs['index'], scroll='1m', doc_type='file',
                    size=config['es_scrollsize'], body=data,
                    request_timeout=config['es_timeout'])

    filehashlist = []
    filehashcount = 0
    while res['hits']['hits'] and len(res['hits']['hits']) > 0:
        for hit in res['hits']['hits']:
            filehash = hit['_source']['filehash']
            if filehash not in filehashlist:
                filehashlist.append(filehash)
                filehashcount += 1
                filehashlist_len = len(filehashlist)
                if filehashlist_len >= batchsize:
                    # send to rq for bots to process file hash key list
                    q.enqueue(dupes_process_hashkey, args=(filehashlist, cliargs,),
                              result_ttl=config['redis_ttl'])
                    if cliargs['debug'] or cliargs['verbose']:
                        logger.info("enqueued batchsize: %s (batchsize: %s)" % (filehashlist_len, batchsize))
                    del filehashlist[:]
                    if cliargs['adaptivebatch']:
                        batchsize = adaptive_batch(q, cliargs, batchsize)
                        if cliargs['debug'] or cliargs['verbose']:
                            logger.info("batchsize set to: %s" % batchsize)
        # use es scroll api
        res = es.scroll(scroll_id=res['_scroll_id'], scroll='1m',
                        request_timeout=config['es_timeout'])

    # enqueue any file hashes remaining in the list
    if len(filehashlist) > 0:
        q.enqueue(dupes_process_hashkey, args=(filehashlist, cliargs,),
                  result_ttl=config['redis_ttl'])

    logger.info('%s file hashes have been enqueued' % filehashcount)

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar = progress_bar('Checking')
        bar.start()
    else:
        bar = None

    # update progress bar until bots are idle and queue is empty
    while worker_bots_busy([q]):
        if bar:
            q_len = len(q)
            try:
                bar.update(q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        time.sleep(1)

    if bar:
        bar.finish()
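
# --- Illustrative sketch (not part of the original module) ---
# The function above pages through every matching document with the
# Elasticsearch scroll API before batching hash keys onto the queue. The
# standalone sketch below shows just that scroll loop with elasticsearch-py;
# the host, index name, and query are assumptions for the example, not
# values taken from the original code.
from elasticsearch import Elasticsearch


def scroll_all_filehashes(host='http://localhost:9200', index='diskover-index'):
    es = Elasticsearch([host])
    query = {"query": {"term": {"hardlinks": 1}}}
    # start the scroll: size is the page size per scroll request
    res = es.search(index=index, scroll='1m', size=1000, body=query)
    filehashes = set()
    while res['hits']['hits']:
        for hit in res['hits']['hits']:
            filehashes.add(hit['_source']['filehash'])
        # fetch the next page until no hits remain
        res = es.scroll(scroll_id=res['_scroll_id'], scroll='1m')
    return filehashes

# example usage:
# print(len(scroll_all_filehashes()))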
def dupes_finder(es, q, cliargs, logger):
    """This is the duplicate file finder function.
    It searches Elasticsearch for files that have the same filehashes
    and adds file hash groups to Queue.
    """
    logger.info('Searching %s for all dupe filehashes...', cliargs['index'])

    # first get all the filehashes with files that have a hardlinks count of 1
    if cliargs['inchardlinks']:
        data = {
            "size": 0,
            "_source": ['filename', 'filehash', 'path_parent', 'last_modified', 'last_access'],
            "query": {
                "bool": {
                    "must": {
                        "range": {
                            "filesize": {
                                "lte": config['dupes_maxsize'],
                                "gte": cliargs['minsize']
                            }
                        }
                    }
                }
            }
        }
    else:
        data = {
            "size": 0,
            "_source": ['filename', 'filehash', 'path_parent', 'last_modified', 'last_access'],
            "query": {
                "bool": {
                    "must": {
                        "term": {"hardlinks": 1}
                    },
                    "filter": {
                        "range": {
                            "filesize": {
                                "lte": config['dupes_maxsize'],
                                "gte": cliargs['minsize']
                            }
                        }
                    }
                }
            }
        }

    # refresh index
    es.indices.refresh(index=cliargs['index'])
    # search es and start scroll
    res = es.search(index=cliargs['index'], doc_type='file', scroll='1m',
                    size=config['es_scrollsize'], body=data,
                    request_timeout=config['es_timeout'])

    filehashes = {}
    while res['hits']['hits'] and len(res['hits']['hits']) > 0:
        for hit in res['hits']['hits']:
            filehash = hit['_source']['filehash']
            filepath = os.path.join(hit['_source']['path_parent'], hit['_source']['filename'])
            if filehash in filehashes:
                filehashes[filehash].append({
                    'id': hit['_id'],
                    'filename': filepath,
                    'atime': hit['_source']['last_access'],
                    'mtime': hit['_source']['last_modified'],
                    'md5': ''
                })
            else:
                filehashes[filehash] = [{
                    'id': hit['_id'],
                    'filename': filepath,
                    'atime': hit['_source']['last_access'],
                    'mtime': hit['_source']['last_modified'],
                    'md5': ''
                }]
        # use es scroll api
        res = es.scroll(scroll_id=res['_scroll_id'], scroll='1m',
                        request_timeout=config['es_timeout'])

    # drop any hash with fewer than two files; count the rest as possible dupes
    possibledupescount = 0
    for key, value in list(filehashes.items()):
        filehash_filecount = len(value)
        if filehash_filecount < 2:
            del filehashes[key]
        else:
            possibledupescount += filehash_filecount

    logger.info('Found %s possible dupe files', possibledupescount)
    if possibledupescount == 0:
        return

    logger.info('Starting to enqueue dupe file hashes...')

    if cliargs['adaptivebatch']:
        batchsize = ab_start
    else:
        batchsize = cliargs['batchsize']
    if cliargs['verbose'] or cliargs['debug']:
        logger.info('Batch size: %s' % batchsize)

    n = 0
    hashgroups = []
    for key, value in filehashes.items():
        if cliargs['verbose'] or cliargs['debug']:
            logger.info('filehash: %s, filecount: %s' % (key, len(value)))
        hashgroups.append({'filehash': key, 'files': value})
        n += 1
        if n >= batchsize:
            # send to rq for bots to process hashgroups list
            q.enqueue(dupes_process_hashkeys, args=(hashgroups, cliargs,),
                      result_ttl=config['redis_ttl'])
            if cliargs['debug'] or cliargs['verbose']:
                logger.info("enqueued batchsize: %s (batchsize: %s)" % (n, batchsize))
            del hashgroups[:]
            n = 0
            if cliargs['adaptivebatch']:
                batchsize = adaptive_batch(q, cliargs, batchsize)
                if cliargs['debug'] or cliargs['verbose']:
                    logger.info("batchsize set to: %s" % batchsize)

    # enqueue any hash groups remaining in the list
    if n > 0:
        q.enqueue(dupes_process_hashkeys, args=(hashgroups, cliargs,),
                  result_ttl=config['redis_ttl'])

    logger.info('%s possible dupe file hashes have been enqueued, worker bots processing dupes...'
                % possibledupescount)

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar = progress_bar('Checking')
        bar.start()
    else:
        bar = None

    # update progress bar until bots are idle and queue is empty
    while worker_bots_busy([q]):
        if bar:
            q_len = len(q)
            try:
                bar.update(q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        time.sleep(1)

    if bar:
        bar.finish()
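
# --- Illustrative sketch (not part of the original module) ---
# This version groups the scrolled hits client-side: files are bucketed by
# filehash in a dict and any hash with fewer than two files is dropped before
# enqueueing. The few lines below show that grouping and pruning step on its
# own in plain Python; the sample data in the usage note is made up.
from collections import defaultdict


def group_possible_dupes(files):
    # files: iterable of (filehash, filepath) tuples
    groups = defaultdict(list)
    for filehash, filepath in files:
        groups[filehash].append(filepath)
    # keep only hashes shared by two or more files
    return {h: paths for h, paths in groups.items() if len(paths) >= 2}

# example usage:
# group_possible_dupes([('abc', '/a/1'), ('abc', '/b/1'), ('def', '/c/2')])
# -> {'abc': ['/a/1', '/b/1']}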
def dupes_finder(es, q, cliargs, logger):
    """This is the duplicate file finder function.
    It searches Elasticsearch for files that have the same filehashes
    and adds file hash groups to Queue.
    """
    logger.info('Searching %s for duplicate file hashes...', cliargs['index'])

    # find the filehashes with largest files and add filehash keys
    # to hashgroups
    data = {
        "size": 0,
        "query": {
            "bool": {
                "must": {
                    "term": {"hardlinks": 1}
                },
                "filter": {
                    "range": {
                        "filesize": {
                            "lte": config['dupes_maxsize'],
                            "gte": cliargs['minsize']
                        }
                    }
                }
            }
        },
        "aggs": {
            "dupe_filehash": {
                "terms": {
                    "field": "filehash",
                    "min_doc_count": 2,
                    "size": 10000,
                    "order": {"max_file_size": "desc"}
                },
                "aggs": {
                    "max_file_size": {
                        "max": {"field": "filesize"}
                    }
                }
            }
        }
    }

    # refresh index
    es.indices.refresh(index=cliargs['index'])
    res = es.search(index=cliargs['index'], doc_type='file', body=data,
                    request_timeout=config['es_timeout'])

    logger.info('Found %s duplicate file hashes, enqueueing...',
                len(res['aggregations']['dupe_filehash']['buckets']))

    # add hash keys to Queue
    for bucket in res['aggregations']['dupe_filehash']['buckets']:
        q.enqueue(dupes_process_hashkey, args=(bucket['key'], cliargs,),
                  result_ttl=config['redis_ttl'])

    logger.info('All file hashes have been enqueued')

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar = progress_bar('Checking')
        bar.start()
    else:
        bar = None

    # update progress bar until bots are idle and queue is empty
    while worker_bots_busy([q]):
        if bar:
            q_len = len(q)
            try:
                bar.update(q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        time.sleep(1)

    if bar:
        bar.finish()
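
# --- Illustrative sketch (not part of the original module) ---
# This version of dupes_finder pushes the grouping work into Elasticsearch:
# a terms aggregation on the filehash field with min_doc_count set to 2
# returns only hashes shared by two or more files. The standalone sketch
# below shows that aggregation on its own (without the size filter or the
# max_file_size ordering above); host and index name are assumptions for
# the example, and the filehash field is assumed to be keyword-mapped.
from elasticsearch import Elasticsearch


def find_dupe_hashes(host='http://localhost:9200', index='diskover-index'):
    es = Elasticsearch([host])
    body = {
        "size": 0,  # no hits needed, only the aggregation buckets
        "aggs": {
            "dupe_filehash": {
                "terms": {
                    "field": "filehash",
                    "min_doc_count": 2,  # keep only hashes seen on 2+ files
                    "size": 10000
                }
            }
        }
    }
    res = es.search(index=index, body=body)
    # each bucket is one duplicate hash plus how many files share it
    return [(b['key'], b['doc_count'])
            for b in res['aggregations']['dupe_filehash']['buckets']]

# example usage:
# for filehash, count in find_dupe_hashes():
#     print(filehash, count)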