def get_files(index, path):
    """Search the ES index for all file docs in path (recursive) and return
    lists of full paths, md5-hashed paths, and (mtime, ctime, atime) tuples.
    """
    newpath = escape_chars(path)
    if newpath == r'\/':
        newpathwildcard = r'\/*'
    else:
        newpathwildcard = newpath + r'\/*'

    logger.info('Searching for all file docs in %s for path %s...', index, path)
    data = {
        '_source': ['path_parent', 'filename', 'last_modified',
                    'last_access', 'last_change'],
        'query': {
            'query_string': {
                'query': '(path_parent: ' + newpath + ') OR '
                         '(path_parent: ' + newpathwildcard + ') OR (filename: "'
                         + os.path.basename(path) + '" AND path_parent: "'
                         + os.path.abspath(os.path.join(path, os.pardir)) + '")',
            }
        }
    }

    es.indices.refresh(index)
    res = es.search(index=index, doc_type='file', scroll='1m',
                    size=config['es_scrollsize'], body=data,
                    request_timeout=config['es_timeout'])

    filelist = []
    filelist_hashed = []
    filelist_times = []
    doccount = 0
    # scroll through results until no more hits
    while res['hits']['hits']:
        for hit in res['hits']['hits']:
            fullpath = os.path.abspath(os.path.join(
                hit['_source']['path_parent'], hit['_source']['filename']))
            mtime = time.mktime(datetime.strptime(
                hit['_source']['last_modified'],
                '%Y-%m-%dT%H:%M:%S').timetuple())
            ctime = time.mktime(datetime.strptime(
                hit['_source']['last_change'],
                '%Y-%m-%dT%H:%M:%S').timetuple())
            atime = time.mktime(datetime.strptime(
                hit['_source']['last_access'],
                '%Y-%m-%dT%H:%M:%S').timetuple())
            filelist.append(fullpath)
            filelist_hashed.append(
                hashlib.md5(fullpath.encode('utf-8')).hexdigest())
            filelist_times.append((mtime, ctime, atime))
            doccount += 1
        # use es scroll api to fetch the next page of results
        res = es.scroll(scroll_id=res['_scroll_id'], scroll='1m',
                        request_timeout=config['es_timeout'])

    logger.info('Found %s file docs', doccount)
    return filelist, filelist_hashed, filelist_times
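
# A minimal usage sketch, assuming the module-level 'es' client, 'config' and
# 'logger' used above are already set up; the index name and path here are
# hypothetical. The three returned lists are index-aligned: filelist[i]
# hashes to filelist_hashed[i] and has the times in filelist_times[i].
def example_get_files_usage():
    filelist, filelist_hashed, filelist_times = get_files(
        'diskover-index', '/data/projects')
    for fullpath, path_hash, (mtime, ctime, atime) in zip(
            filelist, filelist_hashed, filelist_times):
        print(fullpath, path_hash, mtime, ctime, atime)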
def get_metadata(path, cliargs):
    dir_source = ""
    filename = diskover.escape_chars(os.path.basename(path))
    parent_dir = diskover.escape_chars(
        os.path.abspath(os.path.join(path, os.pardir)))
    fullpath = diskover.escape_chars(os.path.abspath(path))

    data = {
        "size": 1,
        "query": {
            "query_string": {
                "query": "filename: " + filename + " AND path_parent: " + parent_dir
            }
        }
    }
    res = es.search(index=cliargs['index2'], doc_type='directory', body=data,
                    request_timeout=diskover.config['es_timeout'])
    try:
        dir_source = res['hits']['hits'][0]['_source']
    except IndexError:
        pass

    data = {"query": {"query_string": {"query": "path_parent: " + fullpath}}}
    files_source = []
    res = es.search(index=cliargs['index2'], doc_type='file', scroll='1m',
                    size=1000, body=data,
                    request_timeout=diskover.config['es_timeout'])
    while res['hits']['hits']:
        for hit in res['hits']['hits']:
            files_source.append(hit['_source'])
        # get es scroll id
        scroll_id = res['_scroll_id']
        # use es scroll api
        res = es.scroll(scroll_id=scroll_id, scroll='1m',
                        request_timeout=diskover.config['es_timeout'])

    return dir_source, files_source
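
# A minimal usage sketch, assuming cliargs carries an 'index2' key naming a
# previous diskover index; the path is hypothetical. dir_source stays "" when
# no matching directory doc exists, so test it before reading fields from it.
def example_get_metadata_usage(cliargs):
    dir_source, files_source = get_metadata('/data/projects', cliargs)
    if dir_source:
        print('directory doc found:', dir_source['filename'])
    print('%d file docs found in previous index' % len(files_source))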
def calc_dir_size(dirlist, cliargs):
    """This is the calculate directory size worker function.
    It gets a directory list from the Queue, searches ES for all files
    in each directory (recursive) and sums their filesizes
    to create a total filesize and item count for each dir,
    then updates the dir doc's filesize and items fields.
    """
    doclist = []

    for path in dirlist:
        totalsize = 0
        totalitems = 1  # 1 for itself
        totalitems_files = 0
        totalitems_subdirs = 0
        # file doc search with aggregate for sum filesizes
        # escape special characters
        newpath = escape_chars(path[1])
        # create wildcard string and check for / (root) path
        if newpath == r'\/':
            newpathwildcard = r'\/*'
        else:
            newpathwildcard = newpath + r'\/*'

        # check if / (root) path
        if newpath == r'\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query': 'path_parent: ' + newpath +
                                 ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard': 'true'
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }

        # search ES for file docs
        res = es.search(index=cliargs['index'], doc_type='file', body=data,
                        request_timeout=config['es_timeout'])
        # total items sum
        totalitems_files += res['hits']['total']
        # total file size sum
        totalsize += res['aggregations']['total_size']['value']

        # directory doc search (subdirs)
        # check if / (root) path
        if newpath == r'\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query': 'path_parent: ' + newpath +
                                 ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard': 'true'
                    }
                }
            }

        # search ES for directory docs
        res = es.search(index=cliargs['index'], doc_type='directory', body=data,
                        request_timeout=config['es_timeout'])
        # total items sum
        totalitems_subdirs += res['hits']['total']

        # total items
        totalitems += totalitems_files + totalitems_subdirs

        # update filesize and items fields for directory (path) doc
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {'filesize': totalsize, 'items': totalitems,
                    'items_files': totalitems_files,
                    'items_subdirs': totalitems_subdirs}
        }
        # add total cost per gb to doc
        if cliargs['costpergb']:
            d = cost_per_gb(d, path[1], path[2], path[3], path[4], 'directory')

        doclist.append(d)

    index_bulk_add(es, doclist, config, cliargs)
def calc_dir_size(dirlist, cliargs):
    """This is the calculate directory size worker function.
    It gets a directory list from the Queue and searches ES for all files
    in each directory (recursive) and sums their filesizes
    to create a total filesize and item count for each dir.
    Updates dir doc's filesize and items fields.
    """
    bot_logger = bot_log_setup(cliargs)
    jobstart = time.time()
    bot_logger.info('*** Calculating directory sizes...')

    for path in dirlist:
        totalsize = 0
        totalitems = 1  # itself
        # file doc search with aggregate for sum filesizes
        # escape special characters
        newpath = diskover.escape_chars(path[1])
        # create wildcard string and check for / (root) path
        if newpath == r'\/':
            newpathwildcard = r'\/*'
        else:
            newpathwildcard = newpath + r'\/*'

        # check if / (root) path
        if newpath == r'\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query': 'path_parent: ' + newpath + ' '
                                 'OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard': 'true'
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }

        # search ES and start scroll
        res = es.search(index=cliargs['index'], doc_type='file', body=data,
                        request_timeout=diskover.config['es_timeout'])
        # total items sum
        totalitems += res['hits']['total']
        # total file size sum
        totalsize += res['aggregations']['total_size']['value']

        # directory doc search (subdirs)
        res = es.search(index=cliargs['index'], doc_type='directory', body=data,
                        request_timeout=diskover.config['es_timeout'])
        # total items sum
        totalitems += res['hits']['total']

        # ES id of directory doc
        directoryid = path[0]

        # update filesize and items fields for directory (path) doc
        es.update(index=cliargs['index'], id=directoryid, doc_type='directory',
                  body={"doc": {'filesize': totalsize, 'items': totalitems}})

    elapsed_time = round(time.time() - jobstart, 3)
    bot_logger.info('*** FINISHED CALC DIR, Elapsed Time: ' + str(elapsed_time))
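
# A minimal standalone sketch of the sum-aggregation pattern both versions of
# calc_dir_size rely on, assuming only an elasticsearch-py client and an index
# with a numeric 'filesize' field (the index name here is hypothetical).
# "size": 0 suppresses hit bodies; only the hit count and the aggregation are
# returned. Note: on ES 7+ res['hits']['total'] is a dict, not a plain int.
def example_sum_filesizes(es_client, escaped_path):
    body = {
        "size": 0,
        "query": {
            "query_string": {
                "query": "path_parent: " + escaped_path +
                         " OR path_parent: " + escaped_path + r"\/*",
                "analyze_wildcard": "true"
            }
        },
        "aggs": {"total_size": {"sum": {"field": "filesize"}}}
    }
    res = es_client.search(index='diskover-index', body=body)
    return res['hits']['total'], res['aggregations']['total_size']['value']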
def get_files_gen(eshost, esver7, index, path):
    """Generator that searches the ES index for all file docs in path
    (recursive) and yields (fullpath, file_hashed, size, mtime, ctime, atime)
    tuples, handling both ES 7.x (typeless) and older (doc_type) clusters.
    """
    newpath = escape_chars(path)
    if newpath == r'\/':
        newpathwildcard = r'\/*'
    else:
        newpathwildcard = newpath + r'\/*'

    logger.info('Searching for all file docs in %s for path %s...', index, path)
    eshost.indices.refresh(index)

    if esver7:
        data = {
            '_source': ['path_parent', 'filename', 'filesize', 'last_modified',
                        'last_access', 'last_change'],
            'query': {
                'query_string': {
                    'query': '((path_parent: ' + newpath + ') OR '
                             '(path_parent: ' + newpathwildcard + ') OR (filename: "'
                             + os.path.basename(path) + '" AND path_parent: "'
                             + os.path.abspath(os.path.join(path, os.pardir))
                             + '")) AND type:file',
                }
            }
        }
        res = eshost.search(index=index, scroll='1m',
                            size=config['es_scrollsize'], body=data,
                            request_timeout=config['es_timeout'])
    else:
        data = {
            '_source': ['path_parent', 'filename', 'filesize', 'last_modified',
                        'last_access', 'last_change'],
            'query': {
                'query_string': {
                    'query': '(path_parent: ' + newpath + ') OR '
                             '(path_parent: ' + newpathwildcard + ') OR (filename: "'
                             + os.path.basename(path) + '" AND path_parent: "'
                             + os.path.abspath(os.path.join(path, os.pardir)) + '")',
                }
            }
        }
        res = eshost.search(index=index, doc_type='file', scroll='1m',
                            size=config['es_scrollsize'], body=data,
                            request_timeout=config['es_timeout'])

    while res['hits']['hits']:
        for hit in res['hits']['hits']:
            fullpath = os.path.abspath(os.path.join(
                hit['_source']['path_parent'], hit['_source']['filename']))
            size = hit['_source']['filesize']
            # translate paths when comparing against a different rootdir;
            # fall back to fullpath so fullpath_rep is always defined
            if args['rootdir2'] != args['rootdir']:
                fullpath_rep = replace_path(fullpath, args['rootdir2'],
                                            args['rootdir'])
            else:
                fullpath_rep = fullpath
            file_hashed = hashlib.md5(fullpath_rep.encode('utf-8')).hexdigest()
            mtime = time.mktime(datetime.strptime(
                hit['_source']['last_modified'],
                '%Y-%m-%dT%H:%M:%S').timetuple())
            ctime = time.mktime(datetime.strptime(
                hit['_source']['last_change'],
                '%Y-%m-%dT%H:%M:%S').timetuple())
            atime = time.mktime(datetime.strptime(
                hit['_source']['last_access'],
                '%Y-%m-%dT%H:%M:%S').timetuple())
            yield fullpath, file_hashed, size, mtime, ctime, atime
        # use es scroll api to fetch the next page of results
        res = eshost.scroll(scroll_id=res['_scroll_id'], scroll='1m',
                            request_timeout=config['es_timeout'])
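
# A minimal usage sketch, assuming an elasticsearch-py client 'es', a pre-7
# cluster (esver7=False), and a hypothetical index name and path. Because
# get_files_gen is a generator, docs stream through in scroll-sized batches
# instead of being accumulated in memory like get_files above.
def example_get_files_gen_usage(es):
    total = 0
    for fullpath, file_hashed, size, mtime, ctime, atime in get_files_gen(
            es, False, 'diskover-index', '/data/projects'):
        total += size
    print('total bytes:', total)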
def calc_dir_size(dirlist, cliargs):
    """This is the calculate directory size worker function.
    It gets a directory list from the Queue and searches ES for all subdirs
    in each directory (recursive) and sums their filesize and items fields
    to create a total filesize and item count for each directory doc.
    Updates directory doc's filesize and items fields.
    """
    # check if other bots are idle and throw them some jobs (dir paths)
    if len(dirlist) >= cliargs['batchsize']:
        # count idle workers separately from the boolean flag so a single
        # idle worker doesn't compare equal to True (1 == True in Python)
        idle_count = 0
        workers_idle = False
        workers = Worker.all(connection=redis_conn)
        num_workers = len(workers)
        for w in workers:
            if w._state == "idle":
                idle_count += 1
                if idle_count > num_workers // 2:
                    workers_idle = True
                    break
        q_len = len(q_calc)
        if q_len == 0 and workers_idle:
            # take half the paths randomly
            shuffle(dirlist)
            n = len(dirlist) // 2
            tossdirs = dirlist[:n]
            dirlist = dirlist[n:]
            q_crawl.enqueue(calc_dir_size, args=(tossdirs, cliargs,))

    doclist = []

    for path in dirlist:
        totalitems = 1  # 1 for itself
        # file doc search with aggregate for sum filesizes
        # escape special characters
        newpath = escape_chars(path[1])
        parentpath = escape_chars(
            os.path.abspath(os.path.join(path[1], os.pardir)))
        pathbasename = escape_chars(os.path.basename(path[1]))
        # create wildcard string and check for / (root) path
        if newpath == r'\/':
            newpathwildcard = r'\/*'
        else:
            newpathwildcard = newpath + r'\/*'

        # check if / (root) path
        if newpath == r'\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                },
                "aggs": {
                    "total_size": {"sum": {"field": "filesize"}},
                    # also sum item counts so the aggregation lookups
                    # below don't KeyError for the root path
                    "total_files": {"sum": {"field": "items_files"}},
                    "total_subdirs": {"sum": {"field": "items_subdirs"}}
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query': '(path_parent: ' + parentpath + ' AND filename: '
                                 + pathbasename + ') OR path_parent: ' + newpath
                                 + ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard': 'true'
                    }
                },
                "aggs": {
                    "total_size": {"sum": {"field": "filesize"}},
                    "total_files": {"sum": {"field": "items_files"}},
                    "total_subdirs": {"sum": {"field": "items_subdirs"}}
                }
            }

        # search ES for all directory docs (subdirs)
        res = es.search(index=cliargs['index'], doc_type='directory', body=data,
                        request_timeout=config['es_timeout'])
        # total file size sum
        totalsize = res['aggregations']['total_size']['value']
        # total items sum for all subdirs count
        totalitems_subdirs = res['aggregations']['total_subdirs']['value']
        # total items sum for all files count
        totalitems_files = res['aggregations']['total_files']['value']
        totalitems += totalitems_subdirs + totalitems_files

        # update filesize and items fields for directory (path) doc
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {'filesize': totalsize, 'items': totalitems,
                    'items_files': totalitems_files,
                    'items_subdirs': totalitems_subdirs}
        }
        doclist.append(d)

    index_bulk_add(es, doclist, config, cliargs)
def calc_dir_size(dirlist, cliargs):
    """This is the calculate directory size worker function.
    It gets a directory list from the Queue and searches ES for all subdirs
    in each directory (recursive) and sums their filesize and items fields
    to create a total filesize and item count for each directory doc.
    Updates directory doc's filesize and items fields.
    """
    doclist = []

    for path in dirlist:
        totalitems = 1  # 1 for itself
        # file doc search with aggregate for sum filesizes
        # escape special characters
        newpath = escape_chars(path[1])
        parentpath = escape_chars(
            os.path.abspath(os.path.join(path[1], os.pardir)))
        pathbasename = escape_chars(os.path.basename(path[1]))
        # create wildcard string and check for / (root) path
        if newpath == r'\/':
            newpathwildcard = r'\/*'
        else:
            newpathwildcard = newpath + r'\/*'

        # check if / (root) path
        if newpath == r'\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                },
                "aggs": {
                    "total_size": {"sum": {"field": "filesize"}},
                    # also sum item counts so the aggregation lookups
                    # below don't KeyError for the root path
                    "total_files": {"sum": {"field": "items_files"}},
                    "total_subdirs": {"sum": {"field": "items_subdirs"}}
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query': '(path_parent: ' + parentpath + ' AND filename: '
                                 + pathbasename + ') OR path_parent: ' + newpath
                                 + ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard': 'true'
                    }
                },
                "aggs": {
                    "total_size": {"sum": {"field": "filesize"}},
                    "total_files": {"sum": {"field": "items_files"}},
                    "total_subdirs": {"sum": {"field": "items_subdirs"}}
                }
            }

        # search ES for all directory docs (subdirs)
        res = es.search(index=cliargs['index'], doc_type='directory', body=data,
                        request_timeout=config['es_timeout'])
        # total file size sum
        totalsize = res['aggregations']['total_size']['value']
        # total items sum for all subdirs count
        totalitems_subdirs = res['aggregations']['total_subdirs']['value']
        # total items sum for all files count
        totalitems_files = res['aggregations']['total_files']['value']
        totalitems += totalitems_subdirs + totalitems_files

        # update filesize and items fields for directory (path) doc
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {'filesize': totalsize, 'items': totalitems,
                    'items_files': totalitems_files,
                    'items_subdirs': totalitems_subdirs}
        }
        doclist.append(d)

    index_bulk_add(es, doclist, config, cliargs)
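
# The update dicts built above follow the elasticsearch-py bulk-helper action
# format (_op_type='update' plus a partial 'doc'). A minimal sketch of how
# such a doclist could be flushed; illustrative only, since the project's own
# index_bulk_add helper may batch and handle errors differently.
from elasticsearch import helpers

def example_bulk_update(es_client, doclist):
    # each action updates only the fields listed in its 'doc' on the
    # existing directory document identified by _id
    helpers.bulk(es_client, doclist, request_timeout=60)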