def es_bulk_adder(worker_name, docs, cliargs, totalcrawltime=None):
    starttime = time.time()

    if not cliargs['s3']:
        bot_logger.info('*** Bulk adding to ES index...')

    try:
        dirlist, filelist = docs
        diskover.index_bulk_add(es, dirlist, diskover.config, cliargs)
        diskover.index_bulk_add(es, filelist, diskover.config, cliargs)
    except ValueError:
        # docs was passed as a single flat list rather than a (dirlist, filelist)
        # pair; index it as-is and count everything as dirs so the worker doc
        # below does not reference undefined names
        dirlist, filelist = docs, []
        diskover.index_bulk_add(es, docs, diskover.config, cliargs)

    if not cliargs['reindex'] and not cliargs['reindexrecurs'] and not cliargs['crawlbot']:
        # default missing crawl time to 0 so round() below does not fail
        if totalcrawltime is None:
            totalcrawltime = 0
        data = {
            "worker_name": worker_name,
            "dir_count": len(dirlist),
            "file_count": len(filelist),
            "bulk_time": round(time.time() - starttime, 6),
            "crawl_time": round(totalcrawltime, 6),
            "indexing_date": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")
        }
        es.index(index=cliargs['index'], doc_type='worker', body=data)

    if not cliargs['s3']:
        elapsed_time = round(time.time() - starttime, 6)
        bot_logger.info('*** FINISHED BULK ADDING, Elapsed Time: ' + str(elapsed_time))
def index_dupes(hashgroup, cliargs):
    """This is the ES dupe_md5 tag update function.
    It updates a file's dupe_md5 field to the md5sum of the file
    if it's marked as a duplicate.
    """
    bot_logger = diskover_worker_bot.bot_logger
    # create Elasticsearch connection
    es = diskover.elasticsearch_connect(diskover.config)
    file_id_list = []
    # bulk update data in Elasticsearch index
    for f in hashgroup['files']:
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'file',
            '_id': f['id'],
            'doc': {'dupe_md5': hashgroup['md5sum']}
        }
        file_id_list.append(d)
    if len(file_id_list) > 0:
        if cliargs['verbose']:
            bot_logger.info('Bulk updating %s files in ES index' % len(file_id_list))
        diskover.index_bulk_add(es, file_id_list, 'file', diskover.config, cliargs)
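# Hedged usage sketch for index_dupes() above (not from the original source):
# the dupe-finder stage is assumed to hand each worker one hash group per md5sum,
# carrying the ES doc ids of the files that share that checksum. All literal
# values below are hypothetical examples.
example_hashgroup = {
    'md5sum': 'd41d8cd98f00b204e9800998ecf8427e',  # checksum shared by the group
    'files': [
        {'id': 'AWnGcnzkBlNYdsqX9Jzz'},            # ES _id of one duplicate file doc
        {'id': 'AWnGcnzkBlNYdsqX9J0a'},            # ES _id of another duplicate file doc
    ]
}
example_cliargs = {'index': 'diskover-2018.01.01', 'verbose': True}
# index_dupes(example_hashgroup, example_cliargs)  # would bulk-set dupe_md5 on both file docs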
def es_bulk_adder(result, cliargs, bot_logger):
    worker_name = get_worker_name()
    starttime = time.time()

    dirlist = []
    filelist = []
    crawltimelist = []
    totalcrawltime = 0

    for item in result:
        if item[0] == 'directory':
            dirlist.append(item[1])
        elif item[0] == 'file':
            filelist.append(item[1])
        elif item[0] == 'crawltime':
            crawltimelist.append(item)
            totalcrawltime += item[2]

    bot_logger.info('*** Bulk adding to ES index...')
    diskover.index_bulk_add(es, dirlist, 'directory', diskover.config, cliargs)
    diskover.index_bulk_add(es, filelist, 'file', diskover.config, cliargs)

    if not cliargs['reindex'] and not cliargs['reindexrecurs'] and not cliargs['crawlbot']:
        diskover.add_crawl_stats_bulk(es, crawltimelist, worker_name, diskover.config, cliargs)
        data = {
            "worker_name": worker_name,
            "dir_count": len(dirlist),
            "file_count": len(filelist),
            "bulk_time": round(time.time() - starttime, 10),
            "crawl_time": round(totalcrawltime, 10),
            "indexing_date": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")
        }
        es.index(index=cliargs['index'], doc_type='worker', body=data)

    elapsed_time = round(time.time() - starttime, 3)
    bot_logger.info('*** FINISHED BULK ADDING, Elapsed Time: ' + str(elapsed_time))
def calc_dir_size(dirlist, cliargs):
    """This is the calculate directory size worker function.
    It gets a directory list from the Queue, searches ES for all files
    in each directory (recursive) and sums their filesizes
    to create a total filesize and item count for each dir,
    then updates the dir doc's filesize and items fields.
    """
    doclist = []
    for path in dirlist:
        totalsize = 0
        totalitems = 1  # 1 for itself
        totalitems_files = 0
        totalitems_subdirs = 0
        # file doc search with aggregate for sum filesizes
        # escape special characters
        newpath = escape_chars(path[1])
        # create wildcard string and check for / (root) path
        if newpath == '\/':
            newpathwildcard = '\/*'
        else:
            newpathwildcard = newpath + '\/*'

        # check if / (root) path
        if newpath == '\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query': 'path_parent: ' + newpath + ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard': 'true'
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }

        # search ES and start scroll
        res = es.search(index=cliargs['index'], doc_type='file', body=data,
                        request_timeout=config['es_timeout'])

        # total items sum
        totalitems_files += res['hits']['total']

        # total file size sum
        totalsize += res['aggregations']['total_size']['value']

        # directory doc search (subdirs)

        # check if / (root) path
        if newpath == '\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query': 'path_parent: ' + newpath + ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard': 'true'
                    }
                }
            }

        # search ES and start scroll
        res = es.search(index=cliargs['index'], doc_type='directory', body=data,
                        request_timeout=config['es_timeout'])

        # total items sum
        totalitems_subdirs += res['hits']['total']

        # total items
        totalitems += totalitems_files + totalitems_subdirs

        # update filesize and items fields for directory (path) doc
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {'filesize': totalsize, 'items': totalitems,
                    'items_files': totalitems_files,
                    'items_subdirs': totalitems_subdirs}
        }
        # add total cost per gb to doc
        if cliargs['costpergb']:
            d = cost_per_gb(d, path[1], path[2], path[3], path[4], 'directory')

        doclist.append(d)

    index_bulk_add(es, doclist, config, cliargs)
def calc_hot_dirs(dirlist, cliargs):
    """This is the calculate hotdirs worker function.
    It gets a directory list from the Queue, iterates over the path list
    and searches index2 for the same path and calculates change percent
    between the two. If path not in index2, change percent is 100%.
    Updates index's directory doc's change_percent fields.
    """
    doclist = []

    for path in dirlist:
        # doc search (matching path) in index2
        # filename
        f = os.path.basename(path[1])
        # parent path
        p = os.path.abspath(os.path.join(path[1], os.pardir))

        data = {
            "size": 1,
            "_source": ['filesize', 'items', 'items_files', 'items_subdirs'],
            "query": {
                "query_string": {
                    "query": "filename: \"" + f + "\" AND path_parent: \"" + p + "\""
                }
            }
        }

        # search ES
        res = es.search(index=cliargs['hotdirs'], doc_type='directory', body=data,
                        request_timeout=config['es_timeout'])

        # calculate change percent

        # set change percent to 100% if no matching path in index2
        if len(res['hits']['hits']) == 0:
            changepercent_filesize = 100.0
            changepercent_items = 100.0
            changepercent_items_files = 100.0
            changepercent_items_subdirs = 100.0
        else:
            source = res['hits']['hits'][0]['_source']
            # ((new - old) / old) * 100
            try:
                # check if path size in index2 was 0 bytes and set change percent to 100%
                if path[2] > 0 and source['filesize'] == 0:
                    changepercent_filesize = 100.0
                else:
                    changepercent_filesize = round(
                        ((path[2] - source['filesize']) / source['filesize']) * 100.0, 2)
            except ZeroDivisionError:
                changepercent_filesize = 0.0
            try:
                # check if path items in index2 was 0 and set change percent to 100%
                if path[3] > 0 and source['items'] == 0:
                    changepercent_items = 100.0
                else:
                    changepercent_items = round(
                        ((path[3] - source['items']) / source['items']) * 100.0, 2)
            except ZeroDivisionError:
                changepercent_items = 0.0
            try:
                # check if path file items in index2 was 0 and set change percent to 100%
                if path[4] > 0 and source['items_files'] == 0:
                    changepercent_items_files = 100.0
                else:
                    changepercent_items_files = round(
                        ((path[4] - source['items_files']) / source['items_files']) * 100.0, 2)
            except ZeroDivisionError:
                changepercent_items_files = 0.0
            try:
                # check if path subdir items in index2 was 0 and set change percent to 100%
                if path[5] > 0 and source['items_subdirs'] == 0:
                    changepercent_items_subdirs = 100.0
                else:
                    changepercent_items_subdirs = round(
                        ((path[5] - source['items_subdirs']) / source['items_subdirs']) * 100.0, 2)
            except ZeroDivisionError:
                changepercent_items_subdirs = 0.0

        # update fields in index
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {'change_percent_filesize': changepercent_filesize,
                    'change_percent_items': changepercent_items,
                    'change_percent_items_files': changepercent_items_files,
                    'change_percent_items_subdirs': changepercent_items_subdirs}
        }
        doclist.append(d)

    index_bulk_add(es, doclist, config, cliargs)
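# Hedged note on calc_hot_dirs() above: each queued entry is assumed to be a tuple
# of (doc_id, path, filesize, items, items_files, items_subdirs) taken from the
# current index, matching the path[0]..path[5] accesses in the loop. The change
# percent is ((new - old) / old) * 100 against the older index2 doc; for example,
# a directory that grew from 40 GB to 50 GB (hypothetical values):
old_filesize = 40e9   # bytes found in the index2 doc
new_filesize = 50e9   # bytes carried in the queued tuple
change_percent_filesize = round(((new_filesize - old_filesize) / old_filesize) * 100.0, 2)
print(change_percent_filesize)  # 25.0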
def tag_copier(path, cliargs):
    """This is the tag copier worker function.
    It gets a path from the Queue and searches index for the same path
    and copies any existing tags (from index2).
    Updates index's doc's tag and tag_custom fields.
    """
    bot_logger = bot_log_setup(cliargs)
    jobstart = time.time()

    dir_id_list = []
    file_id_list = []

    # doc search (matching path) in index for existing tags from index2
    # filename
    f = os.path.basename(path[0])
    # parent path
    p = os.path.abspath(os.path.join(path[0], os.pardir))

    data = {
        "size": 1,
        "_source": ['tag', 'tag_custom'],
        "query": {
            "query_string": {
                "query": "filename: \"" + f + "\" AND path_parent: \"" + p + "\""
            }
        }
    }

    # check if file or directory
    if path[3] == 'directory':
        # search ES
        res = es.search(index=cliargs['index'], doc_type='directory', body=data,
                        request_timeout=diskover.config['es_timeout'])
    else:
        res = es.search(index=cliargs['index'], doc_type='file', body=data,
                        request_timeout=diskover.config['es_timeout'])

    # mark task done if no matching path in index and continue
    if len(res['hits']['hits']) == 0:
        bot_logger.info('*** No matching path found in index')
        return True

    # existing tag in index2
    docid = res['hits']['hits'][0]['_id']

    # update tag and tag_custom fields in index
    d = {
        '_op_type': 'update',
        '_index': cliargs['index'],
        '_type': path[3],
        '_id': docid,
        'doc': {'tag': path[1], 'tag_custom': path[2]}
    }
    if path[3] == 'directory':
        dir_id_list.append(d)
    else:
        file_id_list.append(d)

    diskover.index_bulk_add(es, dir_id_list, 'directory', diskover.config, cliargs)
    diskover.index_bulk_add(es, file_id_list, 'file', diskover.config, cliargs)

    elapsed_time = round(time.time() - jobstart, 3)
    bot_logger.info('*** FINISHED JOB, Elapsed Time: ' + str(elapsed_time))
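# Hedged usage sketch for tag_copier() above: each queued item is assumed to be a
# tuple of (path, tag, tag_custom, doc_type), matching the path[0]..path[3]
# accesses in the function. The literal values below are hypothetical.
example_path_item = (
    '/data/projects/archive',  # path[0]: full path looked up via filename + path_parent
    'archive',                 # path[1]: value copied into the doc's 'tag' field
    'cold-storage',            # path[2]: value copied into the doc's 'tag_custom' field
    'directory',               # path[3]: doc type, either 'directory' or 'file'
)
# tag_copier(example_path_item, {'index': 'diskover-2018.01.01'})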
def calc_dir_size(dirlist, cliargs):
    """This is the calculate directory size worker function.
    It gets a directory list from the Queue and searches ES for all files
    in each directory (recursive) and sums their filesizes
    to create a total filesize and item count for each dir.
    Updates dir doc's filesize and items fields.
    """
    jobstart = time.time()
    bot_logger.info('*** Calculating directory sizes...')

    doclist = []
    for path in dirlist:
        totalsize = 0
        totalitems = 1  # 1 for itself
        totalitems_files = 0
        totalitems_subdirs = 0
        # file doc search with aggregate for sum filesizes
        # escape special characters
        newpath = diskover.escape_chars(path[1])
        # create wildcard string and check for / (root) path
        if newpath == '\/':
            newpathwildcard = '\/*'
        else:
            newpathwildcard = newpath + '\/*'

        # check if / (root) path
        if newpath == '\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query': 'path_parent: ' + newpath + ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard': 'true'
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }

        # search ES and start scroll
        res = es.search(index=cliargs['index'], doc_type='file', body=data,
                        request_timeout=diskover.config['es_timeout'])

        # total items sum
        totalitems_files += res['hits']['total']

        # total file size sum
        totalsize += res['aggregations']['total_size']['value']

        # directory doc search (subdirs)

        # search ES and start scroll
        res = es.search(index=cliargs['index'], doc_type='directory', body=data,
                        request_timeout=diskover.config['es_timeout'])

        # total items sum
        totalitems_subdirs += res['hits']['total']

        # total items
        totalitems += totalitems_files + totalitems_subdirs

        # update filesize and items fields for directory (path) doc
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {'filesize': totalsize, 'items': totalitems,
                    'items_files': totalitems_files,
                    'items_subdirs': totalitems_subdirs}
        }
        doclist.append(d)

    diskover.index_bulk_add(es, doclist, diskover.config, cliargs)

    elapsed_time = round(time.time() - jobstart, 3)
    bot_logger.info('*** FINISHED CALC DIR, Elapsed Time: ' + str(elapsed_time))
def es_bulk_add(worker_name, dirlist, filelist, cliargs, totalcrawltime=None):

    if cliargs['chunkfiles']:
        updated_dirlist = []
        # check for existing directory docs in index and update crawl time only (dirchunk)
        for d in dirlist:
            try:
                path = d['chunkpath']  # this key determines if it's part of a chunked dir
                crawltime = d['crawl_time']
                f = os.path.basename(path)
                # parent path
                p = os.path.abspath(os.path.join(path, os.pardir))
                data = {
                    "size": 1,
                    "_source": ['crawl_time'],
                    "query": {
                        "query_string": {
                            "query": "filename: \"" + f + "\" AND path_parent: \"" + p + "\""
                        }
                    }
                }
                es.indices.refresh(index=cliargs['index'])
                res = es.search(index=cliargs['index'], doc_type='directory', body=data,
                                request_timeout=config['es_timeout'])
                if len(res['hits']['hits']) == 0:
                    continue
                docid = res['hits']['hits'][0]['_id']
                current_crawltime = res['hits']['hits'][0]['_source']['crawl_time']
                updated_crawltime = current_crawltime + crawltime
                # update crawltime in index
                d = {
                    '_op_type': 'update',
                    '_index': cliargs['index'],
                    '_type': 'directory',
                    '_id': docid,
                    'doc': {'crawl_time': updated_crawltime}
                }
            except KeyError:
                pass  # not part of a chunked dir
            updated_dirlist.append(d)

        dirlist = updated_dirlist

    starttime = time.time()

    docs = dirlist + filelist
    index_bulk_add(es, docs, config, cliargs)

    if not cliargs['noworkerdocs']:
        # default missing crawl time to 0 so round() below does not fail
        if totalcrawltime is None:
            totalcrawltime = 0
        data = {
            "worker_name": worker_name,
            "dir_count": len(dirlist),
            "file_count": len(filelist),
            "bulk_time": round(time.time() - starttime, 6),
            "crawl_time": round(totalcrawltime, 6),
            "indexing_date": datetime.utcnow().isoformat()
        }
        es.index(index=cliargs['index'], doc_type='worker', body=data)
def calc_dir_size(dirlist, cliargs):
    """This is the calculate directory size worker function.
    It gets a directory list from the Queue and searches ES for all subdirs
    in each directory (recursive) and sums their filesize and items fields
    to create a total filesize and item count for each directory doc.
    Updates directory doc's filesize and items fields.
    """

    # check if other bots are idle and throw them some jobs (dir paths)
    if len(dirlist) >= cliargs['batchsize']:
        workers_idle = 0
        enough_workers_idle = False
        workers = Worker.all(connection=redis_conn)
        num_workers = len(workers)
        for w in workers:
            if w._state == "idle":
                workers_idle += 1
                if workers_idle > num_workers // 2:
                    enough_workers_idle = True
                    break
        q_len = len(q_calc)
        if q_len == 0 and enough_workers_idle:
            # take half the paths randomly
            shuffle(dirlist)
            n = len(dirlist) // 2
            tossdirs = dirlist[:n]
            dirlist = dirlist[n:]
            q_crawl.enqueue(calc_dir_size, args=(tossdirs, cliargs,))

    doclist = []

    for path in dirlist:
        totalitems = 1  # 1 for itself
        # file doc search with aggregate for sum filesizes
        # escape special characters
        newpath = escape_chars(path[1])
        parentpath = escape_chars(os.path.abspath(os.path.join(path[1], os.pardir)))
        pathbasename = escape_chars(os.path.basename(path[1]))

        # create wildcard string and check for / (root) path
        if newpath == '\/':
            newpathwildcard = '\/*'
        else:
            newpathwildcard = newpath + '\/*'

        # check if / (root) path
        if newpath == '\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                },
                "aggs": {
                    # include the items aggs so the root-path branch matches the
                    # total_files / total_subdirs lookups below
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    },
                    "total_files": {
                        "sum": {
                            "field": "items_files"
                        }
                    },
                    "total_subdirs": {
                        "sum": {
                            "field": "items_subdirs"
                        }
                    }
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query': '(path_parent: ' + parentpath + ' AND filename: ' + pathbasename +
                                 ') OR path_parent: ' + newpath + ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard': 'true'
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    },
                    "total_files": {
                        "sum": {
                            "field": "items_files"
                        }
                    },
                    "total_subdirs": {
                        "sum": {
                            "field": "items_subdirs"
                        }
                    }
                }
            }

        # search ES and start scroll for all directory doc search (subdirs)
        res = es.search(index=cliargs['index'], doc_type='directory', body=data,
                        request_timeout=config['es_timeout'])

        # total file size sum
        totalsize = res['aggregations']['total_size']['value']

        # total items sum for all subdirs count
        totalitems_subdirs = res['aggregations']['total_subdirs']['value']

        # total items sum for all files count
        totalitems_files = res['aggregations']['total_files']['value']

        totalitems += totalitems_subdirs + totalitems_files

        # update filesize and items fields for directory (path) doc
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {'filesize': totalsize, 'items': totalitems,
                    'items_files': totalitems_files,
                    'items_subdirs': totalitems_subdirs}
        }
        doclist.append(d)

    index_bulk_add(es, doclist, config, cliargs)
def calc_dir_size(dirlist, cliargs):
    """This is the calculate directory size worker function.
    It gets a directory list from the Queue and searches ES for all subdirs
    in each directory (recursive) and sums their filesize and items fields
    to create a total filesize and item count for each directory doc.
    Updates directory doc's filesize and items fields.
    """
    doclist = []

    for path in dirlist:
        totalitems = 1  # 1 for itself
        # file doc search with aggregate for sum filesizes
        # escape special characters
        newpath = escape_chars(path[1])
        parentpath = escape_chars(os.path.abspath(os.path.join(path[1], os.pardir)))
        pathbasename = escape_chars(os.path.basename(path[1]))

        # create wildcard string and check for / (root) path
        if newpath == '\/':
            newpathwildcard = '\/*'
        else:
            newpathwildcard = newpath + '\/*'

        # check if / (root) path
        if newpath == '\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                },
                "aggs": {
                    # include the items aggs so the root-path branch matches the
                    # total_files / total_subdirs lookups below
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    },
                    "total_files": {
                        "sum": {
                            "field": "items_files"
                        }
                    },
                    "total_subdirs": {
                        "sum": {
                            "field": "items_subdirs"
                        }
                    }
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query': '(path_parent: ' + parentpath + ' AND filename: ' + pathbasename +
                                 ') OR path_parent: ' + newpath + ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard': 'true'
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    },
                    "total_files": {
                        "sum": {
                            "field": "items_files"
                        }
                    },
                    "total_subdirs": {
                        "sum": {
                            "field": "items_subdirs"
                        }
                    }
                }
            }

        # search ES and start scroll for all directory doc search (subdirs)
        res = es.search(index=cliargs['index'], doc_type='directory', body=data,
                        request_timeout=config['es_timeout'])

        # total file size sum
        totalsize = res['aggregations']['total_size']['value']

        # total items sum for all subdirs count
        totalitems_subdirs = res['aggregations']['total_subdirs']['value']

        # total items sum for all files count
        totalitems_files = res['aggregations']['total_files']['value']

        totalitems += totalitems_subdirs + totalitems_files

        # update filesize and items fields for directory (path) doc
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {'filesize': totalsize, 'items': totalitems,
                    'items_files': totalitems_files,
                    'items_subdirs': totalitems_subdirs}
        }
        doclist.append(d)

    index_bulk_add(es, doclist, config, cliargs)
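# Hedged sketch (not from the original source) of how a dispatcher might feed
# calc_dir_size(): directory docs are assumed to be queued as (doc_id, fullpath)
# tuples in batches of cliargs['batchsize'], mirroring the q_crawl.enqueue() call
# used above when a bot tosses half its batch to idle workers. Names such as
# redis_conn, q_calc, the queue name and enqueue_dir_size_batches are assumptions
# for illustration only.
from redis import Redis
from rq import Queue

redis_conn = Redis(host='localhost', port=6379)
q_calc = Queue('diskover_calcdir', connection=redis_conn)


def enqueue_dir_size_batches(dirtuples, cliargs):
    """Split (doc_id, fullpath) tuples into batches and enqueue calc_dir_size jobs."""
    batchsize = cliargs['batchsize']
    for i in range(0, len(dirtuples), batchsize):
        q_calc.enqueue(calc_dir_size, args=(dirtuples[i:i + batchsize], cliargs,))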