def scrape_tree_meta(paths, cliargs, reindex_dict):
    try:
        global worker
        tree_dirs = []
        tree_files = []
        totalcrawltime = 0
        statsembeded = False
        num_workers = len(SimpleWorker.all(connection=redis_conn))
        path_count = 0
        filenames = []
        for path in paths:
            path_count += 1
            starttime = time.time()
            if not cliargs['dirsonly']:
                root, dirs, files = path
            else:
                root, dirs = path
                files = []
            if path_count == 1:
                if type(root) is tuple:
                    # stats are embedded in data from diskover tree walk client or crawlapi
                    statsembeded = True
            if statsembeded:
                root_path = root[0]
                dmeta = get_dir_meta(worker, root, cliargs, reindex_dict,
                                     statsembeded=True)
            else:
                root_path = root
                dmeta = get_dir_meta(worker, root_path, cliargs, reindex_dict,
                                     statsembeded=False)
            if dmeta:
                # no files in batch, get them with scandir
                if cliargs['dirsonly']:
                    # use root_path rather than root, which is a tuple when stats are embedded
                    for entry in scandir(root_path):
                        if entry.is_file(follow_symlinks=False) and not file_excluded(entry.name):
                            files.append(entry.name)
                filecount = 0
                # check if the directory has a ton of files in it and
                # farm out meta collection to other worker bots
                files_count = len(files)
                if cliargs['splitfiles'] and files_count >= cliargs['splitfilesnum']:
                    fmetas = []
                    for filelist in split_list(files, int(files_count / num_workers)):
                        fmetas.append(q_crawl.enqueue(file_meta_collector,
                                                      args=(filelist, root_path, statsembeded,
                                                            cliargs, reindex_dict,),
                                                      result_ttl=config['redis_ttl']))
                    n = 0
                    while n < len(fmetas):
                        if fmetas[n].result:
                            for fmeta in fmetas[n].result:
                                if fmeta:
                                    tree_files.append(fmeta)
                                    filecount += 1
                            n += 1
                        else:
                            # yield briefly while waiting on worker results
                            # rather than busy-spinning on the cpu
                            time.sleep(.05)
                    del fmetas[:]
                else:
                    for file in files:
                        # file is a (name, stats) tuple when stats are embedded,
                        # otherwise a bare file name string
                        filenames.append(file[0] if statsembeded else file)
                        if statsembeded:
                            fmeta = get_file_meta(worker, file, cliargs, reindex_dict,
                                                  statsembeded=True)
                        else:
                            fmeta = get_file_meta(worker, os.path.join(root_path, file),
                                                  cliargs, reindex_dict, statsembeded=False)
                        if fmeta:
                            tree_files.append(fmeta)
                            filecount += 1
                # update crawl time
                elapsed = time.time() - starttime
                dmeta['crawl_time'] = round(elapsed, 6)
                # check for empty dirs and dirsonly cli arg
                if cliargs['indexemptydirs']:
                    tree_dirs.append(dmeta)
                elif len(dirs) > 0 or filecount > 0:
                    tree_dirs.append(dmeta)
                totalcrawltime += elapsed
            # check if doc count is more than es chunksize and bulk add to es
            if len(tree_dirs) + len(tree_files) >= config['es_chunksize']:
                es_bulk_add(worker, tree_dirs, tree_files, cliargs, totalcrawltime)
                del tree_dirs[:]
                del tree_files[:]
                totalcrawltime = 0
        # bulk add any remaining docs to es
        if len(tree_dirs) > 0 or len(tree_files) > 0:
            es_bulk_add(worker, tree_dirs, tree_files, cliargs, totalcrawltime)
        print('%s | processed %d files' % (datetime.now(), len(filenames)))
        return True, filenames
    except Exception as e:
        print('%s | error | %s' % (datetime.now(), e))
        return False, []
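# Both scrape_tree_meta variants shard the file list across worker bots with a
# split_list helper that is not part of this excerpt. A minimal sketch of what
# such a helper could look like, assuming it simply yields fixed-size chunks
# (the real diskover helper may differ); the max(1, n) guard is an added
# defensive touch for the case where num_workers exceeds files_count:
def split_list(lst, n):
    """Yield successive n-sized chunks from lst."""
    n = max(1, n)  # avoid a zero step when the caller's division rounds to 0
    for i in range(0, len(lst), n):
        yield lst[i:i + n]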
def dupes_finder(es, q, cliargs, logger):
    """This is the duplicate file finder function.
    It searches Elasticsearch for files that have the same filehashes
    and adds file hash groups to the Queue.
    """
    logger.info('Searching %s for duplicate file hashes...', cliargs['index'])

    # find the filehashes with the largest files and add filehash keys to hashgroups
    data = {
        "size": 0,
        "query": {
            "bool": {
                "must": {
                    "term": {"hardlinks": 1}
                },
                "filter": {
                    "range": {
                        "filesize": {
                            "lte": config['dupes_maxsize'],
                            "gte": cliargs['minsize']
                        }
                    }
                }
            }
        },
        "aggs": {
            "dupe_filehash": {
                "terms": {
                    "field": "filehash",
                    "min_doc_count": 2,
                    "size": 10000,
                    "order": {"max_file_size": "desc"}
                },
                "aggs": {
                    "max_file_size": {"max": {"field": "filesize"}}
                }
            }
        }
    }

    # refresh index
    es.indices.refresh(index=cliargs['index'])
    res = es.search(index=cliargs['index'], doc_type='file', body=data,
                    request_timeout=config['es_timeout'])

    logger.info('Found %s duplicate file hashes, enqueueing...',
                len(res['aggregations']['dupe_filehash']['buckets']))

    # add hash keys to Queue
    for bucket in res['aggregations']['dupe_filehash']['buckets']:
        q.enqueue(dupes_process_hashkey, args=(bucket['key'], cliargs,),
                  result_ttl=config['redis_ttl'])

    logger.info('All file hashes have been enqueued')

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar = progress_bar('Checking')
        bar.start()
    else:
        bar = None

    # wait for queue to be empty and update progress bar
    time.sleep(1)
    while True:
        workers_busy = False
        workers = SimpleWorker.all(connection=redis_conn)
        for worker in workers:
            if worker._state == "busy":
                workers_busy = True
                break
        q_len = len(q)
        if bar:
            try:
                bar.update(q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        if q_len == 0 and not workers_busy:
            break
        time.sleep(.5)

    if bar:
        bar.finish()
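# dupes_finder above and qumulo_treewalk below inline the same drain-and-poll
# loop against the RQ queue. A hedged refactor sketch of that shared logic;
# wait_for_queue is not a diskover function, and the SimpleWorker._state
# check simply mirrors what the inlined loops do:
def wait_for_queue(q, conn, on_update=None, poll=.5):
    """Block until the RQ queue is empty and no SimpleWorker is busy."""
    while True:
        busy = any(w._state == "busy"
                   for w in SimpleWorker.all(connection=conn))
        q_len = len(q)
        if on_update:
            on_update(q_len)  # e.g. drive a progress bar
        if q_len == 0 and not busy:
            break
        time.sleep(poll)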
def get_report_metrics(self):
    conn = self.connection
    worker_count = len(SimpleWorker.all(connection=conn))
    fq = get_failed_queue(connection=conn)
    return {"queue.failed": len(fq), "queue.workers": worker_count}
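# A minimal harness for get_report_metrics above; MetricsSource is
# hypothetical and exists only to supply the self.connection attribute the
# method reads (get_failed_queue ships with rq releases prior to 1.0):
from redis import Redis

class MetricsSource(object):
    def __init__(self, connection):
        self.connection = connection

MetricsSource.get_report_metrics = get_report_metrics

# e.g.: MetricsSource(Redis(host='localhost')).get_report_metrics()
# -> {'queue.failed': 0, 'queue.workers': 2}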
def scrape_tree_meta(paths, cliargs, reindex_dict):
    global worker
    tree_dirs = []
    tree_files = []
    totalcrawltime = 0
    num_workers = len(SimpleWorker.all(connection=redis_conn))
    for path in paths:
        starttime = time.time()
        root, dirs, files = path
        # check if dirchunk or stats embedded in data from
        # diskover tree walk client or crawlapi
        if type(root) is tuple:
            if root[1] == 'dchunk':
                dirchunk = True
                statsembeded = False
            else:
                statsembeded = True
                dirchunk = False
        else:
            statsembeded = False
            dirchunk = False
        if statsembeded:
            root_path = root[0]
            dmeta = get_dir_meta(worker, root, cliargs, reindex_dict,
                                 statsembeded=True)
        else:
            if dirchunk:
                root_path = root[0]
                dmeta = {'chunkpath': root_path}
            else:
                root_path = root
                dmeta = get_dir_meta(worker, root_path, cliargs, reindex_dict,
                                     statsembeded=False)
        if dmeta:
            filecount = 0
            # check if the directory has a ton of files in it and
            # farm out meta collection to other worker bots
            files_count = len(files)
            if cliargs['splitfiles'] and files_count >= cliargs['splitfilesnum']:
                fmetas = []
                for filelist in split_list(files, int(files_count / num_workers)):
                    fmetas.append(q_crawl.enqueue(file_meta_collector,
                                                  args=(filelist, root_path, statsembeded,
                                                        cliargs, reindex_dict,),
                                                  result_ttl=config['redis_ttl']))
                n = 0
                while n < len(fmetas):
                    if fmetas[n].result:
                        for fmeta in fmetas[n].result:
                            if fmeta:
                                tree_files.append(fmeta)
                                filecount += 1
                        n += 1
                    else:
                        time.sleep(.05)
                del fmetas[:]
            else:
                for file in files:
                    if statsembeded:
                        fmeta = get_file_meta(worker, file, cliargs, reindex_dict,
                                              statsembeded=True)
                    else:
                        fmeta = get_file_meta(worker, os.path.join(root_path, file),
                                              cliargs, reindex_dict, statsembeded=False)
                    if fmeta:
                        tree_files.append(fmeta)
                        filecount += 1
            # update crawl time
            elapsed = time.time() - starttime
            dmeta['crawl_time'] = round(elapsed, 6)
            # check for empty dirs
            if cliargs['indexemptydirs']:
                tree_dirs.append(dmeta)
            elif len(dirs) > 0 or filecount > 0:
                tree_dirs.append(dmeta)
            totalcrawltime += elapsed
        # check if doc count is more than es chunksize and bulk add to es
        if len(tree_dirs) + len(tree_files) >= config['es_chunksize']:
            es_bulk_add(worker, tree_dirs, tree_files, cliargs, totalcrawltime)
            del tree_dirs[:]
            del tree_files[:]
            totalcrawltime = 0
    # bulk add any remaining docs to es
    if len(tree_dirs) > 0 or len(tree_files) > 0:
        es_bulk_add(worker, tree_dirs, tree_files, cliargs, totalcrawltime)
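# The root element of each batch item arrives in one of three shapes, decoded
# by the branch ladder at the top of the variant above. A small classifier
# sketch of that contract; classify_root is not a diskover function, and the
# exact stats payload carried in the embedded-stats tuple is not shown here:
def classify_root(root):
    """Return (root_path, kind) where kind is 'plain', 'stats' or 'dchunk'."""
    if type(root) is tuple:
        if root[1] == 'dchunk':
            return root[0], 'dchunk'   # directory chunk: only a stub doc is built
        return root[0], 'stats'        # stats embedded by the tree walk client
    return root, 'plain'               # bare path string: stat locally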
def qumulo_treewalk(path, ip, ses, q_crawl, num_sep, level, batchsize,
                    cliargs, logger, reindex_dict):
    batch = []
    dircount = 0
    totaldirs = 0
    starttime = time.time()

    # queues for paths
    q_paths = PyQueue()
    q_paths_results = PyQueue()
    lock = Lock()

    # set up threads for tree walk
    for i in range(cliargs['walkthreads']):
        t = Thread(target=apiwalk_worker,
                   args=(ip, ses, q_paths, q_paths_results, lock,))
        t.daemon = True
        t.start()

    # set up progress bar
    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        widgets = [progressbar.AnimatedMarker(), ' Crawling (Queue: ',
                   progressbar.Counter(), progressbar.FormatLabel(''), ') ',
                   progressbar.Timer()]
        bar = progressbar.ProgressBar(widgets=widgets,
                                      max_value=progressbar.UnknownLength)
        bar.start()
    else:
        bar = None
    bartimestamp = time.time()

    for root, dirs, files in qumulo_api_walk(path, ip, ses, q_paths, q_paths_results):
        dircount += 1
        totaldirs += 1
        if len(dirs) == 0 and len(files) == 0 and not cliargs['indexemptydirs']:
            continue
        if root['path'] != '/':
            root_path = root['path'].rstrip(os.path.sep)
        else:
            root_path = root['path']
        if not dir_excluded(root_path, config, cliargs):
            # scrape_tree_meta unpacks (root, dirs, files) from each batch item
            batch.append((root, dirs, files))
            batch_len = len(batch)
            if batch_len >= batchsize:
                q_crawl.enqueue(scrape_tree_meta,
                                args=(batch, cliargs, reindex_dict,),
                                result_ttl=config['redis_ttl'])
                del batch[:]
                if cliargs['adaptivebatch']:
                    batchsize = adaptive_batch(q_crawl, cliargs, batchsize)
            # check if at maxdepth level and delete dirs/files lists to not
            # descend further down the tree
            if cliargs['maxdepth']:
                num_sep_this = root_path.count(os.path.sep)
                if num_sep + level <= num_sep_this:
                    del dirs[:]
                    del files[:]
        else:
            # directory excluded
            del dirs[:]
            del files[:]

        # update progress bar
        if bar:
            try:
                if time.time() - bartimestamp >= 2:
                    elapsed = round(time.time() - bartimestamp, 3)
                    dirspersec = round(dircount / elapsed, 3)
                    widgets[4] = progressbar.FormatLabel(
                        ', ' + str(dirspersec) + ' dirs/sec) ')
                    bartimestamp = time.time()
                    dircount = 0
                bar.update(len(q_crawl))
            except (ZeroDivisionError, ValueError):
                bar.update(0)

    # add any remaining in batch to queue
    if len(batch) > 0:
        q_crawl.enqueue(scrape_tree_meta,
                        args=(batch, cliargs, reindex_dict,),
                        result_ttl=config['redis_ttl'])

    # set up progress bar with time remaining
    if bar:
        bar.finish()
        bar_max_val = len(q_crawl)
        bar = progressbar.ProgressBar(max_value=bar_max_val)
        bar.start()

    # wait for queue to be empty and update progress bar
    while True:
        workers_busy = False
        workers = SimpleWorker.all(connection=redis_conn)
        for worker in workers:
            if worker._state == "busy":
                workers_busy = True
                break
        q_len = len(q_crawl)
        if bar:
            try:
                bar.update(bar_max_val - q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        if q_len == 0 and not workers_busy:
            break
        time.sleep(.5)

    if bar:
        bar.finish()

    elapsed = round(time.time() - starttime, 3)
    dirspersec = round(totaldirs / elapsed, 3)
    logger.info("Finished crawling, elapsed time %s sec, dirs walked %s (%s dirs/sec)"
                % (elapsed, totaldirs, dirspersec))
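# qumulo_treewalk delegates batch sizing to adaptive_batch when
# --adaptivebatch is set; that function's body is not part of this excerpt.
# A plausible sketch only, assuming it grows the batch while the crawl queue
# has a backlog and shrinks it when workers are starved (ab_min and ab_max
# are hypothetical bounds, not diskover config values):
def adaptive_batch(q, cliargs, batchsize, ab_min=10, ab_max=500):
    if len(q) == 0:
        # workers are idle: hand out smaller batches so work starts sooner
        batchsize = max(ab_min, batchsize - 10)
    else:
        # a backlog exists: larger batches cut per-job enqueue overhead
        batchsize = min(ab_max, batchsize + 10)
    return batchsize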