def start_importing(es, cliargs, logger):
    """Start importing s3 inventory file function.
    """
    # start csv file reader threads that consume from s3queue
    for i in range(4):
        thread = Thread(target=csv_file_reader, args=(s3queue,))
        thread.daemon = True
        thread.start()

    # start importing S3 inventory file(s)
    inventory_files = cliargs['s3']
    logger.info('Importing %s S3 inventory file(s)...' % len(inventory_files))

    # add fake disk space to index with path set to /s3
    data = {
        "path": '/s3',
        "total": 0,
        "used": 0,
        "free": 0,
        "available": 0,
        "indexing_date": datetime.utcnow().isoformat()
    }
    es.index(index=cliargs['index'], doc_type='diskspace', body=data)

    # add all s3 inventory files to queue
    for file in inventory_files:
        s3queue.put((file, cliargs))

    # set up progress bar
    bar = progress_bar('Importing')
    bar.start()

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        i = 1
        while s3queue.qsize() > 0:
            try:
                percent = int("{0:.0f}".format(
                    100 * ((len(inventory_files) - s3queue.qsize())
                           / float(len(inventory_files)))))
                bar.update(percent)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
            time.sleep(.5)
            i += 1
        bar.finish()

    # wait for queue to be empty
    s3queue.join()

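# The reader threads started above consume (file, cliargs) tuples from s3queue.
# The real csv_file_reader is defined elsewhere in the module; the sketch below
# only illustrates the worker-loop shape that start_importing() relies on
# (blocking get, process one inventory file, task_done so s3queue.join()
# can return). The parsing/indexing step is a hypothetical placeholder.

def csv_file_reader_sketch(q):
    """Illustrative worker loop for the s3queue consumer threads."""
    while True:
        # block until start_importing() queues an inventory file
        inventory_file, cliargs = q.get()
        try:
            # placeholder: parse the S3 inventory csv and index its rows
            # (the actual logic lives in csv_file_reader)
            pass
        finally:
            # mark the item done so s3queue.join() can unblock
            q.task_done()
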
def dupes_finder(es, q, cliargs, logger):
    """This is the duplicate file finder function.
    It searches Elasticsearch for files that have the same filehashes
    and adds file hash groups to Queue.
    """
    logger.info('Searching %s for all dupe filehashes...', cliargs['index'])

    # first get all the filehashes with files that have a hardlinks count of 1
    if cliargs['inchardlinks']:
        data = {
            "size": 0,
            "_source": ['filename', 'filehash', 'path_parent',
                        'last_modified', 'last_access'],
            "query": {
                "bool": {
                    "must": {
                        "range": {
                            "filesize": {
                                "lte": config['dupes_maxsize'],
                                "gte": cliargs['minsize']
                            }
                        }
                    }
                }
            }
        }
    else:
        data = {
            "size": 0,
            "_source": ['filename', 'filehash', 'path_parent',
                        'last_modified', 'last_access'],
            "query": {
                "bool": {
                    "must": {
                        "term": {"hardlinks": 1}
                    },
                    "filter": {
                        "range": {
                            "filesize": {
                                "lte": config['dupes_maxsize'],
                                "gte": cliargs['minsize']
                            }
                        }
                    }
                }
            }
        }

    # refresh index
    es.indices.refresh(index=cliargs['index'])

    # search es and start scroll
    res = es.search(index=cliargs['index'], doc_type='file', scroll='1m',
                    size=config['es_scrollsize'], body=data,
                    request_timeout=config['es_timeout'])

    filehashes = {}
    while res['hits']['hits'] and len(res['hits']['hits']) > 0:
        for hit in res['hits']['hits']:
            filehash = hit['_source']['filehash']
            filepath = os.path.join(hit['_source']['path_parent'],
                                    hit['_source']['filename'])
            if filehash in filehashes:
                filehashes[filehash].append({
                    'id': hit['_id'],
                    'filename': filepath,
                    'atime': hit['_source']['last_access'],
                    'mtime': hit['_source']['last_modified'],
                    'md5': ''
                })
            else:
                filehashes[filehash] = [{
                    'id': hit['_id'],
                    'filename': filepath,
                    'atime': hit['_source']['last_access'],
                    'mtime': hit['_source']['last_modified'],
                    'md5': ''
                }]
        # use es scroll api
        res = es.scroll(scroll_id=res['_scroll_id'], scroll='1m',
                        request_timeout=config['es_timeout'])

    # drop filehashes that only have a single file
    possibledupescount = 0
    for key, value in list(filehashes.items()):
        filehash_filecount = len(value)
        if filehash_filecount < 2:
            del filehashes[key]
        else:
            possibledupescount += filehash_filecount

    logger.info('Found %s possible dupe files', possibledupescount)
    if possibledupescount == 0:
        return

    logger.info('Starting to enqueue dupe file hashes...')

    if cliargs['adaptivebatch']:
        batchsize = ab_start
    else:
        batchsize = cliargs['batchsize']
    if cliargs['verbose'] or cliargs['debug']:
        logger.info('Batch size: %s' % batchsize)

    n = 0
    hashgroups = []
    for key, value in filehashes.items():
        if cliargs['verbose'] or cliargs['debug']:
            logger.info('filehash: %s, filecount: %s' % (key, len(value)))
        hashgroups.append({'filehash': key, 'files': value})
        n += 1
        if n >= batchsize:
            # send to rq for bots to process hashgroups list
            q.enqueue(dupes_process_hashkeys, args=(hashgroups, cliargs,),
                      result_ttl=config['redis_ttl'])
            if cliargs['debug'] or cliargs['verbose']:
                logger.info("enqueued batchsize: %s (batchsize: %s)"
                            % (n, batchsize))
            del hashgroups[:]
            n = 0
            if cliargs['adaptivebatch']:
                batchsize = adaptive_batch(q, cliargs, batchsize)
                if cliargs['debug'] or cliargs['verbose']:
                    logger.info("batchsize set to: %s" % batchsize)

    # enqueue any remaining hash groups
    if n > 0:
        q.enqueue(dupes_process_hashkeys, args=(hashgroups, cliargs,),
                  result_ttl=config['redis_ttl'])

    logger.info('%s possible dupe file hashes have been enqueued, '
                'worker bots processing dupes...' % possibledupescount)

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar = progress_bar('Checking')
        bar.start()
    else:
        bar = None

    # update progress bar until bots are idle and queue is empty
    while worker_bots_busy([q]):
        if bar:
            q_len = len(q)
            try:
                bar.update(q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        time.sleep(1)

    if bar:
        bar.finish()

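# worker_bots_busy() is provided by diskover itself; the sketch below only
# mirrors the inline check used by the aggregation-based dupes_finder variant
# later in this file (RQ worker states plus queue length) and assumes the same
# module-level redis_conn connection object is available.

from rq import SimpleWorker


def worker_bots_busy_sketch(queues):
    """Return True while any RQ worker is busy or any given queue is non-empty."""
    for worker in SimpleWorker.all(connection=redis_conn):
        if worker._state == "busy":
            return True
    for q in queues:
        if len(q) > 0:
            return True
    return False
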
def start_importing(es, cliargs, logger):
    """Start importing s3 inventory file function.
    """
    # start csv file reader threads that consume from s3queue
    for i in range(4):
        thread = Thread(target=csv_file_reader, args=(s3queue,))
        thread.daemon = True
        thread.start()

    # start importing S3 inventory file(s)
    inventory_files = cliargs['s3']
    logger.info('Importing %s S3 inventory file(s)...' % len(inventory_files))

    # add fake disk space to index with path set to /s3
    data = {
        "path": '/s3',
        "total": 0,
        "used": 0,
        "free": 0,
        "available": 0,
        "indexing_date": datetime.utcnow().isoformat()
    }
    es.index(index=cliargs['index'], doc_type='diskspace', body=data)

    # create fake root directory doc
    time_utc_now = datetime.utcnow().isoformat()
    time_utc_epoch_start = "1970-01-01T00:00:00"
    root_dict = {}
    root_dict['filename'] = "s3"
    root_dict['path_parent'] = "/"
    root_dict["filesize"] = 0
    root_dict["items"] = 1  # 1 for itself
    root_dict["items_files"] = 0
    root_dict["items_subdirs"] = 0
    root_dict["last_modified"] = time_utc_epoch_start
    root_dict["tag"] = ""
    root_dict["tag_custom"] = ""
    root_dict["indexing_date"] = time_utc_now
    root_dict["worker_name"] = "main"
    root_dict["change_percent_filesize"] = ""
    root_dict["change_percent_items"] = ""
    root_dict["change_percent_items_files"] = ""
    root_dict["change_percent_items_subdirs"] = ""
    es.index(index=cliargs['index'], doc_type='directory', body=root_dict)

    diskover.add_crawl_stats(es, cliargs['index'], '/s3', 0)

    # add all s3 inventory files to queue
    for file in inventory_files:
        s3queue.put((file, cliargs))

    # set up progress bar
    bar = diskover.progress_bar('Importing')
    bar.start()

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        i = 1
        while s3queue.qsize() > 0:
            try:
                percent = int("{0:.0f}".format(
                    100 * ((len(inventory_files) - s3queue.qsize())
                           / float(len(inventory_files)))))
                bar.update(percent)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
            time.sleep(.5)
            i += 1
        bar.finish()

    # wait for queue to be empty
    s3queue.join()

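# Quick illustrative check, not part of diskover, that the fake /s3 diskspace
# and root directory docs created above were indexed. It assumes the same es
# client and index name passed to start_importing(), and match queries are used
# so the sketch does not depend on exact field mappings.

def verify_s3_root_docs(es, index):
    """Print hit counts for the fake /s3 diskspace and directory docs."""
    es.indices.refresh(index=index)
    # fake disk space doc with path set to /s3
    res = es.search(index=index, doc_type='diskspace',
                    body={"query": {"match": {"path": "/s3"}}})
    # hits.total is an int on ES 5/6, a dict on newer versions
    print('diskspace docs for /s3:', res['hits']['total'])
    # fake root directory doc (filename "s3", parent "/")
    res = es.search(index=index, doc_type='directory',
                    body={"query": {"bool": {"must": [
                        {"match": {"filename": "s3"}},
                        {"match": {"path_parent": "/"}}]}}})
    print('root directory docs:', res['hits']['total'])
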
def dupes_finder(es, q, cliargs, logger):
    """This is the duplicate file finder function.
    It searches Elasticsearch for files that have the same filehashes
    and adds file hash groups to Queue.
    """
    logger.info('Searching %s for duplicate file hashes...', cliargs['index'])

    # find the filehashes with largest files and add filehash keys
    # to hashgroups
    data = {
        "size": 0,
        "query": {
            "bool": {
                "must": {
                    "term": {"hardlinks": 1}
                },
                "filter": {
                    "range": {
                        "filesize": {
                            "lte": config['dupes_maxsize'],
                            "gte": cliargs['minsize']
                        }
                    }
                }
            }
        },
        "aggs": {
            "dupe_filehash": {
                "terms": {
                    "field": "filehash",
                    "min_doc_count": 2,
                    "size": 10000,
                    "order": {"max_file_size": "desc"}
                },
                "aggs": {
                    "max_file_size": {"max": {"field": "filesize"}}
                }
            }
        }
    }

    # refresh index
    es.indices.refresh(index=cliargs['index'])

    res = es.search(index=cliargs['index'], doc_type='file', body=data,
                    request_timeout=config['es_timeout'])

    logger.info('Found %s duplicate file hashes, enqueueing...',
                len(res['aggregations']['dupe_filehash']['buckets']))

    # add hash keys to Queue
    for bucket in res['aggregations']['dupe_filehash']['buckets']:
        q.enqueue(dupes_process_hashkey, args=(bucket['key'], cliargs,),
                  result_ttl=config['redis_ttl'])

    logger.info('All file hashes have been enqueued')

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar = progress_bar('Checking')
        bar.start()
    else:
        bar = None

    # wait for queue to be empty and update progress bar
    time.sleep(1)
    while True:
        workers_busy = False
        workers = SimpleWorker.all(connection=redis_conn)
        for worker in workers:
            if worker._state == "busy":
                workers_busy = True
                break
        q_len = len(q)
        if bar:
            try:
                bar.update(q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        if q_len == 0 and not workers_busy:
            break
        time.sleep(.5)

    if bar:
        bar.finish()

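# The rq job enqueued above (dupes_process_hashkey) runs in the worker bots and
# is defined in the bot module; the sketch below is only a simplified idea of
# that step: pull every file doc sharing one filehash and confirm real dupes by
# hashing file contents. The whole-file md5 read and the return shape are
# assumptions for illustration, not diskover's exact logic.

import hashlib
import os


def verify_dupes_for_hash_sketch(es, index, filehash):
    """Group files sharing a filehash by the md5 of their actual contents."""
    body = {"query": {"term": {"filehash": filehash}}}
    res = es.search(index=index, doc_type='file', size=1000, body=body)
    groups = {}
    for hit in res['hits']['hits']:
        path = os.path.join(hit['_source']['path_parent'],
                            hit['_source']['filename'])
        try:
            # simplified: hash the whole file in one read (a real implementation
            # would read in chunks and likely byte-check first)
            with open(path, 'rb') as f:
                md5 = hashlib.md5(f.read()).hexdigest()
        except OSError:
            continue
        groups.setdefault(md5, []).append(hit['_id'])
    # only md5 groups with 2+ members are confirmed duplicates
    return {md5: ids for md5, ids in groups.items() if len(ids) > 1}
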
def dupes_finder(es, q, cliargs, logger):
    """This is the duplicate file finder function.
    It searches Elasticsearch for files that have the same filehashes
    and adds file hash groups to Queue.
    """
    logger.info('Searching %s for all duplicate files...', cliargs['index'])

    if cliargs['adaptivebatch']:
        batchsize = ab_start
    else:
        batchsize = cliargs['batchsize']
    if cliargs['verbose'] or cliargs['debug']:
        logger.info('Batch size: %s' % batchsize)

    # first get all the filehashes with files that have a hardlinks count of 1
    data = {
        "size": 0,
        "query": {
            "bool": {
                "must": {
                    "term": {"hardlinks": 1}
                },
                "filter": {
                    "range": {
                        "filesize": {
                            "lte": config['dupes_maxsize'],
                            "gte": cliargs['minsize']
                        }
                    }
                }
            }
        }
    }

    # refresh index
    es.indices.refresh(index=cliargs['index'])

    # search es and start scroll
    res = es.search(index=cliargs['index'], scroll='1m', doc_type='file',
                    size=config['es_scrollsize'], body=data,
                    request_timeout=config['es_timeout'])

    filehashlist = []
    filehashcount = 0
    while res['hits']['hits'] and len(res['hits']['hits']) > 0:
        for hit in res['hits']['hits']:
            filehash = hit['_source']['filehash']
            if filehash not in filehashlist:
                filehashlist.append(filehash)
                filehashcount += 1
                filehashlist_len = len(filehashlist)
                if filehashlist_len >= batchsize:
                    # send to rq for bots to process file hashkey list
                    q.enqueue(dupes_process_hashkey,
                              args=(filehashlist, cliargs,),
                              result_ttl=config['redis_ttl'])
                    if cliargs['debug'] or cliargs['verbose']:
                        logger.info("enqueued batchsize: %s (batchsize: %s)"
                                    % (filehashlist_len, batchsize))
                    del filehashlist[:]
                    if cliargs['adaptivebatch']:
                        batchsize = adaptive_batch(q, cliargs, batchsize)
                        if cliargs['debug'] or cliargs['verbose']:
                            logger.info("batchsize set to: %s" % batchsize)
        # use es scroll api
        res = es.scroll(scroll_id=res['_scroll_id'], scroll='1m',
                        request_timeout=config['es_timeout'])

    # enqueue any remaining file hashes
    if len(filehashlist) > 0:
        q.enqueue(dupes_process_hashkey, args=(filehashlist, cliargs,),
                  result_ttl=config['redis_ttl'])

    logger.info('%s file hashes have been enqueued' % filehashcount)

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar = progress_bar('Checking')
        bar.start()
    else:
        bar = None

    # update progress bar until bots are idle and queue is empty
    while worker_bots_busy([q]):
        if bar:
            q_len = len(q)
            try:
                bar.update(q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        time.sleep(1)

    if bar:
        bar.finish()

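# adaptive_batch() is provided by diskover; this sketch only illustrates the
# general idea suggested by its call sites above: grow the batch size while the
# rq queue stays short (bots are keeping up) and fall back toward the starting
# size when the queue backs up. ab_step and ab_max are hypothetical bounds
# here; only ab_start appears in the code above.

def adaptive_batch_sketch(q, cliargs, batchsize, ab_start=50, ab_step=50, ab_max=500):
    """Return an adjusted batch size based on the current rq queue depth."""
    q_len = len(q)
    if q_len == 0 and batchsize + ab_step <= ab_max:
        # queue drained: bots are idle, push bigger batches
        return batchsize + ab_step
    if q_len > 0 and batchsize > ab_start:
        # queue backing up: shrink toward the starting batch size
        return max(ab_start, batchsize - ab_step)
    return batchsize
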