def get_files(index, path):
    """Searches the ES index for all file docs under path and returns a list
    of full paths, a list of md5-hashed paths, and a list of
    (mtime, ctime, atime) tuples.
    """
    newpath = escape_chars(path)
    if newpath == '\/':
        newpathwildcard = '\/*'
    else:
        newpathwildcard = newpath + '\/*'

    logger.info('Searching for all file docs in %s for path %s...', index, path)

    data = {
        '_source': ['path_parent', 'filename', 'last_modified',
                    'last_access', 'last_change'],
        'query': {
            'query_string': {
                'query': '(path_parent: ' + newpath + ') OR '
                         '(path_parent: ' + newpathwildcard + ') OR (filename: "'
                         + os.path.basename(path) + '" AND path_parent: "'
                         + os.path.abspath(os.path.join(path, os.pardir)) + '")',
            }
        }
    }

    es.indices.refresh(index)
    res = es.search(index=index, doc_type='file', scroll='1m',
                    size=config['es_scrollsize'], body=data,
                    request_timeout=config['es_timeout'])

    filelist = []
    filelist_hashed = []
    filelist_times = []
    doccount = 0

    while res['hits']['hits'] and len(res['hits']['hits']) > 0:
        for hit in res['hits']['hits']:
            fullpath = os.path.abspath(
                os.path.join(hit['_source']['path_parent'],
                             hit['_source']['filename']))
            mtime = time.mktime(
                datetime.strptime(hit['_source']['last_modified'],
                                  '%Y-%m-%dT%H:%M:%S').timetuple())
            ctime = time.mktime(
                datetime.strptime(hit['_source']['last_change'],
                                  '%Y-%m-%dT%H:%M:%S').timetuple())
            atime = time.mktime(
                datetime.strptime(hit['_source']['last_access'],
                                  '%Y-%m-%dT%H:%M:%S').timetuple())
            filelist.append(fullpath)
            filelist_hashed.append(
                hashlib.md5(fullpath.encode('utf-8')).hexdigest())
            filelist_times.append((mtime, ctime, atime))
            doccount += 1
        # use es scroll api
        res = es.scroll(scroll_id=res['_scroll_id'], scroll='1m',
                        request_timeout=config['es_timeout'])

    logger.info('Found %s file docs' % str(doccount))

    return filelist, filelist_hashed, filelist_times

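# Hedged usage sketch (not part of the module): the three lists returned by
# get_files() line up by position, so a caller can zip them to pair each full
# path with its md5-hashed path and (mtime, ctime, atime) tuple. The index
# name and path below are hypothetical.
def _example_get_files_usage():
    files, files_hashed, files_times = get_files('diskover-index', '/srv/data')
    for fullpath, pathhash, (mtime, ctime, atime) in zip(files, files_hashed, files_times):
        logger.info('%s (path md5 %s) mtime=%s ctime=%s atime=%s',
                    fullpath, pathhash, mtime, ctime, atime)
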
def populate_hashgroup(key, cliargs):
    """Searches ES for all files matching hashgroup key (filehash)
    and returns a dict containing the matching files.
    Returns None if only one file matches.
    """
    hashgroup_files = []

    data = {
        "_source": ["path_parent", "filename", "last_access",
                    "last_modified", "type"],
        "query": {
            "bool": {
                "must": {
                    "term": {"filehash": key}
                },
                "filter": {
                    "term": {"type": "file"}
                }
            }
        }
    }
    res = es.search(index=cliargs['index'], size="1000", body=data,
                    request_timeout=config['es_timeout'])

    # return None if only 1 matching file
    if len(res['hits']['hits']) == 1:
        return None

    # add any hits to hashgroups
    for hit in res['hits']['hits']:
        hashgroup_files.append({
            'id': hit['_id'],
            'filename': hit['_source']['path_parent'] + "/" + hit['_source']['filename'],
            'atime': hit['_source']['last_access'],
            'mtime': hit['_source']['last_modified']
        })

    # return filehash group and add to queue
    fhg = {'filehash': key, 'files': hashgroup_files, 'md5sum': ''}

    return fhg

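# Hedged usage sketch (hypothetical helper, not part of the module): the key
# passed in is a hypothetical filehash value; the dict returned by
# populate_hashgroup() mirrors the structure built above
# ({'filehash': ..., 'files': [...], 'md5sum': ''}) or is None when only a
# single document shares the hash.
def _example_populate_hashgroup_usage(cliargs):
    fhg = populate_hashgroup('d41d8cd98f00b204e9800998ecf8427e', cliargs)
    if fhg is None:
        return  # only one file has this filehash, nothing to compare
    for f in fhg['files']:
        logger.info('possible dupe: %s (atime=%s, mtime=%s)',
                    f['filename'], f['atime'], f['mtime'])
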
def dupes_finder(es, q, cliargs, logger):
    """This is the duplicate file finder function.
    It searches Elasticsearch for files that have the same filehash
    and adds file hash groups to the Queue.
    """
    logger.info('Searching %s for all dupe filehashes...', cliargs['index'])

    # first get all the filehashes with files that have a hardlinks count of 1
    if cliargs['inchardlinks']:
        data = {
            "size": 0,
            "_source": ['filename', 'filehash', 'path_parent',
                        'last_modified', 'last_access'],
            "query": {
                "bool": {
                    "must": {
                        "range": {
                            "filesize": {
                                "lte": config['dupes_maxsize'],
                                "gte": cliargs['minsize']
                            }
                        }
                    }
                }
            }
        }
    else:
        data = {
            "size": 0,
            "_source": ['filename', 'filehash', 'path_parent',
                        'last_modified', 'last_access'],
            "query": {
                "bool": {
                    "must": {
                        "term": {"hardlinks": 1}
                    },
                    "filter": {
                        "range": {
                            "filesize": {
                                "lte": config['dupes_maxsize'],
                                "gte": cliargs['minsize']
                            }
                        }
                    }
                }
            }
        }

    # refresh index
    es.indices.refresh(index=cliargs['index'])
    # search es and start scroll
    res = es.search(index=cliargs['index'], doc_type='file', scroll='1m',
                    size=config['es_scrollsize'], body=data,
                    request_timeout=config['es_timeout'])

    filehashes = {}
    while res['hits']['hits'] and len(res['hits']['hits']) > 0:
        for hit in res['hits']['hits']:
            filehash = hit['_source']['filehash']
            filepath = os.path.join(hit['_source']['path_parent'],
                                    hit['_source']['filename'])
            if filehash in filehashes:
                filehashes[filehash].append({
                    'id': hit['_id'],
                    'filename': filepath,
                    'atime': hit['_source']['last_access'],
                    'mtime': hit['_source']['last_modified'],
                    'md5': ''
                })
            else:
                filehashes[filehash] = [{
                    'id': hit['_id'],
                    'filename': filepath,
                    'atime': hit['_source']['last_access'],
                    'mtime': hit['_source']['last_modified'],
                    'md5': ''
                }]
        # use es scroll api
        res = es.scroll(scroll_id=res['_scroll_id'], scroll='1m',
                        request_timeout=config['es_timeout'])

    possibledupescount = 0
    for key, value in list(filehashes.items()):
        filehash_filecount = len(value)
        if filehash_filecount < 2:
            del filehashes[key]
        else:
            possibledupescount += filehash_filecount

    logger.info('Found %s possible dupe files', possibledupescount)
    if possibledupescount == 0:
        return

    logger.info('Starting to enqueue dupe file hashes...')

    if cliargs['adaptivebatch']:
        batchsize = ab_start
    else:
        batchsize = cliargs['batchsize']
    if cliargs['verbose'] or cliargs['debug']:
        logger.info('Batch size: %s' % batchsize)

    n = 0
    hashgroups = []
    for key, value in filehashes.items():
        if cliargs['verbose'] or cliargs['debug']:
            logger.info('filehash: %s, filecount: %s' % (key, len(value)))
        hashgroups.append({'filehash': key, 'files': value})
        n += 1
        if n >= batchsize:
            # send to rq for bots to process hashgroups list
            q.enqueue(dupes_process_hashkeys, args=(hashgroups, cliargs,),
                      result_ttl=config['redis_ttl'])
            if cliargs['debug'] or cliargs['verbose']:
                logger.info("enqueued batchsize: %s (batchsize: %s)" % (n, batchsize))
            del hashgroups[:]
            n = 0
            if cliargs['adaptivebatch']:
                batchsize = adaptive_batch(q, cliargs, batchsize)
                if cliargs['debug'] or cliargs['verbose']:
                    logger.info("batchsize set to: %s" % batchsize)

    # enqueue any remaining hash groups
    if n > 0:
        q.enqueue(dupes_process_hashkeys, args=(hashgroups, cliargs,),
                  result_ttl=config['redis_ttl'])

    logger.info('%s possible dupe file hashes have been enqueued, '
                'worker bots processing dupes...' % possibledupescount)

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar = progress_bar('Checking')
        bar.start()
    else:
        bar = None

    # update progress bar until bots are idle and queue is empty
    while worker_bots_busy([q]):
        if bar:
            q_len = len(q)
            try:
                bar.update(q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        time.sleep(1)

    if bar:
        bar.finish()

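# Hedged sketch only (an assumption, not diskover's actual adaptive_batch()):
# one plausible heuristic grows the batch size while workers keep the RQ queue
# drained and shrinks it again when jobs start piling up. _AB_MAX and _AB_STEP
# are hypothetical tuning constants; ab_start is the module's adaptive
# starting batch size referenced above.
_AB_MAX = 500
_AB_STEP = 10

def _example_adaptive_batch(q, cliargs, batchsize):
    q_len = len(q)  # rq.Queue supports len() as the pending job count
    if q_len == 0:
        # bots are keeping up, so enqueue larger batches
        return min(batchsize + _AB_STEP, _AB_MAX)
    # queue is backing up, so back off toward the starting batch size
    return max(batchsize - _AB_STEP, ab_start)
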
def dupes_finder(es, q, cliargs, logger):
    """This is the duplicate file finder function.
    It searches Elasticsearch for files that have the same filehash
    and adds file hash groups to the Queue.
    """
    logger.info('Searching %s for duplicate file hashes...', cliargs['index'])

    # find the filehashes with the largest files and add filehash keys
    # to hashgroups
    data = {
        "size": 0,
        "query": {
            "bool": {
                "must": {
                    "term": {"hardlinks": 1}
                },
                "filter": {
                    "range": {
                        "filesize": {
                            "lte": config['dupes_maxsize'],
                            "gte": cliargs['minsize']
                        }
                    }
                }
            }
        },
        "aggs": {
            "dupe_filehash": {
                "terms": {
                    "field": "filehash",
                    "min_doc_count": 2,
                    "size": 10000,
                    "order": {"max_file_size": "desc"}
                },
                "aggs": {
                    "max_file_size": {
                        "max": {"field": "filesize"}
                    }
                }
            }
        }
    }

    # refresh index
    es.indices.refresh(index=cliargs['index'])
    res = es.search(index=cliargs['index'], doc_type='file', body=data,
                    request_timeout=config['es_timeout'])

    logger.info('Found %s duplicate file hashes, enqueueing...',
                len(res['aggregations']['dupe_filehash']['buckets']))

    # add hash keys to Queue
    for bucket in res['aggregations']['dupe_filehash']['buckets']:
        q.enqueue(dupes_process_hashkey, args=(bucket['key'], cliargs,),
                  result_ttl=config['redis_ttl'])

    logger.info('All file hashes have been enqueued')

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar = progress_bar('Checking')
        bar.start()
    else:
        bar = None

    # wait for queue to be empty and update progress bar
    time.sleep(1)
    while True:
        workers_busy = False
        workers = SimpleWorker.all(connection=redis_conn)
        for worker in workers:
            if worker._state == "busy":
                workers_busy = True
                break
        q_len = len(q)
        if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
            try:
                bar.update(q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        if q_len == 0 and not workers_busy:
            break
        time.sleep(.5)

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar.finish()

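# Hedged sketch (an assumption): the polling loop above is essentially what a
# helper like worker_bots_busy(queues), used by the other dupes_finder
# variants in this module, could wrap: report True while any queue still
# holds jobs or any RQ SimpleWorker reports a busy state.
def _example_worker_bots_busy(queues):
    busy = any(w._state == "busy"
               for w in SimpleWorker.all(connection=redis_conn))
    pending = any(len(queue) > 0 for queue in queues)
    return busy or pending
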
def dupes_finder(es, q, cliargs, logger):
    """This is the duplicate file finder function.
    It searches Elasticsearch for files that have the same filehash
    and adds file hash groups to the Queue.
    """
    logger.info('Searching %s for all duplicate files...', cliargs['index'])

    if cliargs['adaptivebatch']:
        batchsize = ab_start
    else:
        batchsize = cliargs['batchsize']
    if cliargs['verbose'] or cliargs['debug']:
        logger.info('Batch size: %s' % batchsize)

    # first get all the filehashes with files that have a hardlinks count of 1
    data = {
        "size": 0,
        "query": {
            "bool": {
                "must": {
                    "term": {"hardlinks": 1}
                },
                "filter": {
                    "range": {
                        "filesize": {
                            "lte": config['dupes_maxsize'],
                            "gte": cliargs['minsize']
                        }
                    }
                }
            }
        }
    }

    # refresh index
    es.indices.refresh(index=cliargs['index'])
    # search es and start scroll
    res = es.search(index=cliargs['index'], scroll='1m', doc_type='file',
                    size=config['es_scrollsize'], body=data,
                    request_timeout=config['es_timeout'])

    filehashlist = []
    filehashcount = 0
    while res['hits']['hits'] and len(res['hits']['hits']) > 0:
        for hit in res['hits']['hits']:
            filehash = hit['_source']['filehash']
            if filehash not in filehashlist:
                filehashlist.append(filehash)
                filehashcount += 1
                filehashlist_len = len(filehashlist)
                if filehashlist_len >= batchsize:
                    # send to rq for bots to process file hashkey list
                    q.enqueue(dupes_process_hashkey,
                              args=(filehashlist, cliargs,),
                              result_ttl=config['redis_ttl'])
                    if cliargs['debug'] or cliargs['verbose']:
                        logger.info("enqueued batchsize: %s (batchsize: %s)"
                                    % (filehashlist_len, batchsize))
                    del filehashlist[:]
                    if cliargs['adaptivebatch']:
                        batchsize = adaptive_batch(q, cliargs, batchsize)
                        if cliargs['debug'] or cliargs['verbose']:
                            logger.info("batchsize set to: %s" % batchsize)
        # use es scroll api
        res = es.scroll(scroll_id=res['_scroll_id'], scroll='1m',
                        request_timeout=config['es_timeout'])

    # enqueue any remaining file hashes in the list
    if len(filehashlist) > 0:
        q.enqueue(dupes_process_hashkey, args=(filehashlist, cliargs,),
                  result_ttl=config['redis_ttl'])

    logger.info('%s file hashes have been enqueued' % filehashcount)

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar = progress_bar('Checking')
        bar.start()
    else:
        bar = None

    # update progress bar until bots are idle and queue is empty
    while worker_bots_busy([q]):
        if bar:
            q_len = len(q)
            try:
                bar.update(q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        time.sleep(1)

    if bar:
        bar.finish()

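# Hedged sketch (an assumption, not the project's dupes_process_hashkey): a
# bot consuming the filehash batches enqueued above could expand each key with
# populate_hashgroup() and then md5 the file contents in chunks to confirm
# which candidates are true duplicates.
def _example_process_hashkeys(hashkeys, cliargs):
    dupegroups = []
    for key in hashkeys:
        fhg = populate_hashgroup(key, cliargs)
        if fhg is None:  # only one file shares this filehash
            continue
        md5groups = {}
        for f in fhg['files']:
            md5 = hashlib.md5()
            with open(f['filename'], 'rb') as fp:
                for chunk in iter(lambda: fp.read(65536), b''):
                    md5.update(chunk)
            md5groups.setdefault(md5.hexdigest(), []).append(f)
        # keep only md5 groups that actually contain more than one file
        dupegroups.extend(g for g in md5groups.values() if len(g) > 1)
    return dupegroups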