def tag_copier(path, cliargs):
    """Tag copier worker function.

    Receives a path tuple from the Queue, searches the index for the doc
    matching that path (by filename + parent path) and queues a bulk
    update of its 'tag' and 'tag_custom' fields with the tags carried in
    the path tuple (which originate from index2).

    Returns True early when no matching doc exists in the index.
    """
    filename = os.path.basename(path[0])
    parent_path = os.path.abspath(os.path.join(path[0], os.pardir))

    # doc search (matching path) in index for existing tags from index2
    search_body = {
        "size": 1,
        "_source": ['tag', 'tag_custom'],
        "query": {
            "query_string": {
                "query": 'filename: "%s" AND path_parent: "%s"' % (filename, parent_path)
            }
        }
    }

    # file and directory docs live under different doc types
    doc_type = 'directory' if path[3] == 'directory' else 'file'
    res = es.search(index=cliargs['index'], doc_type=doc_type,
                    body=search_body, request_timeout=config['es_timeout'])

    hits = res['hits']['hits']
    # mark task done if no matching path in index and continue
    if not hits:
        return True

    # update tag and tag_custom fields on the matched doc in index
    update_doc = {
        '_op_type': 'update',
        '_index': cliargs['index'],
        '_type': path[3],
        '_id': hits[0]['_id'],
        'doc': {
            'tag': path[1],
            'tag_custom': path[2]
        }
    }
    index_bulk_add(es, [update_doc], config, cliargs)
def get_metadata(path, cliargs):
    """Fetch existing metadata for *path* from index2.

    Looks up the directory doc matching *path* (filename + parent path)
    and scrolls through every file doc whose path_parent is *path*.

    Returns a tuple (dir_source, files_source): the directory doc's
    _source (or "" when no match is found) and a list of _source dicts
    for the files.
    """
    name = escape_chars(os.path.basename(path))
    parent = escape_chars(os.path.abspath(os.path.join(path, os.pardir)))
    abs_path = escape_chars(os.path.abspath(path))

    # directory doc lookup (single hit)
    dir_query = {
        "size": 1,
        "query": {
            "query_string": {
                "query": "filename: " + name + " AND path_parent: " + parent
            }
        }
    }
    res = es.search(index=cliargs['index2'], doc_type='directory',
                    body=dir_query, request_timeout=config['es_timeout'])
    hits = res['hits']['hits']
    dir_source = hits[0]['_source'] if hits else ""

    # file docs lookup via the scroll api
    file_query = {"query": {"query_string": {"query": "path_parent: " + abs_path}}}
    files_source = []
    res = es.search(index=cliargs['index2'], doc_type='file', scroll='1m',
                    size=config['es_scrollsize'], body=file_query,
                    request_timeout=config['es_timeout'])
    # keep scrolling until a page comes back empty
    while res['hits']['hits']:
        files_source.extend(hit['_source'] for hit in res['hits']['hits'])
        res = es.scroll(scroll_id=res['_scroll_id'], scroll='1m',
                        request_timeout=config['es_timeout'])

    return dir_source, files_source
def calc_dir_size(dirlist, cliargs):
    """This is the calculate directory size worker function.

    Gets a directory list from the Queue, searches ES for all files in
    each directory (recursive) and sums their filesizes to create a
    total filesize and item count for each dir, then updates the dir
    doc's filesize and items fields via a bulk update.
    """
    doclist = []
    for path in dirlist:
        totalsize = 0
        totalitems = 1  # 1 for itself
        totalitems_files = 0
        totalitems_subdirs = 0
        # escape special characters for the query_string queries
        newpath = escape_chars(path[1])
        # build the recursive query; / (root) path is a special case
        # (raw strings: '\/' is an invalid escape sequence in a normal
        # string literal and warns on modern Python)
        if newpath == r'\/':
            query_str = "path_parent: " + newpath + "*"
        else:
            newpathwildcard = newpath + r'\/*'
            query_str = ('path_parent: ' + newpath +
                         ' OR path_parent: ' + newpathwildcard)

        # file doc search with aggregate for summed filesizes
        data = {
            "size": 0,
            "query": {
                "query_string": {
                    "query": query_str,
                    "analyze_wildcard": "true"
                }
            },
            "aggs": {
                "total_size": {
                    "sum": {
                        "field": "filesize"
                    }
                }
            }
        }
        res = es.search(index=cliargs['index'], doc_type='file', body=data,
                        request_timeout=config['es_timeout'])
        # total items sum
        totalitems_files += res['hits']['total']
        # total file size sum
        totalsize += res['aggregations']['total_size']['value']

        # directory doc search (subdirs) — same query, no aggregation needed
        data = {
            "size": 0,
            "query": {
                "query_string": {
                    "query": query_str,
                    "analyze_wildcard": "true"
                }
            }
        }
        res = es.search(index=cliargs['index'], doc_type='directory', body=data,
                        request_timeout=config['es_timeout'])
        # total items sum
        totalitems_subdirs += res['hits']['total']

        # total items
        totalitems += totalitems_files + totalitems_subdirs

        # update filesize and items fields for directory (path) doc
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {'filesize': totalsize, 'items': totalitems,
                    'items_files': totalitems_files,
                    'items_subdirs': totalitems_subdirs}
        }
        # add total cost per gb to doc
        if cliargs['costpergb']:
            d = cost_per_gb(d, path[1], path[2], path[3], path[4], 'directory')
        doclist.append(d)
    index_bulk_add(es, doclist, config, cliargs)
def _change_percent(new, old):
    """Return the percent change from *old* to *new*, rounded to 2 places.

    Mirrors the original inline logic exactly: when *old* is 0 but *new*
    is positive, report 100%; when both are 0 (division by zero),
    report 0%.
    """
    try:
        # value in index2 was 0 but current value is positive: 100% change
        if new > 0 and old == 0:
            return 100.0
        # ((new - old) / old) * 100
        return round(((new - old) / old) * 100.0, 2)
    except ZeroDivisionError:
        return 0.0


def calc_hot_dirs(dirlist, cliargs):
    """This is the calculate hotdirs worker function.

    Gets a directory list from the Queue, iterates over the path list
    and searches index2 for the same path, then calculates the change
    percent between the two. If the path is not in index2, change
    percent is 100%. Updates the index's directory doc's change_percent
    fields via a bulk update.
    """
    doclist = []
    for path in dirlist:
        # doc search (matching path) in index2
        f = os.path.basename(path[1])
        p = os.path.abspath(os.path.join(path[1], os.pardir))
        data = {
            "size": 1,
            "_source": ['filesize', 'items', 'items_files', 'items_subdirs'],
            "query": {
                "query_string": {
                    "query": 'filename: "%s" AND path_parent: "%s"' % (f, p)
                }
            }
        }
        res = es.search(index=cliargs['hotdirs'], doc_type='directory',
                        body=data, request_timeout=config['es_timeout'])

        # set change percent to 100% if no matching path in index2
        if len(res['hits']['hits']) == 0:
            changepercent_filesize = 100.0
            changepercent_items = 100.0
            changepercent_items_files = 100.0
            changepercent_items_subdirs = 100.0
        else:
            source = res['hits']['hits'][0]['_source']
            changepercent_filesize = _change_percent(path[2], source['filesize'])
            changepercent_items = _change_percent(path[3], source['items'])
            changepercent_items_files = _change_percent(path[4], source['items_files'])
            changepercent_items_subdirs = _change_percent(path[5], source['items_subdirs'])

        # update fields in index
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {'change_percent_filesize': changepercent_filesize,
                    'change_percent_items': changepercent_items,
                    'change_percent_items_files': changepercent_items_files,
                    'change_percent_items_subdirs': changepercent_items_subdirs}
        }
        doclist.append(d)
    index_bulk_add(es, doclist, config, cliargs)
def es_bulk_add(worker_name, dirlist, filelist, cliargs, totalcrawltime=None):
    """Bulk add directory and file docs to ES.

    When cliargs['chunkfiles'] is set, directory docs that are part of a
    chunked dir (carry a 'chunkpath' key) are converted into updates that
    accumulate crawl_time onto the existing doc in the index; chunked
    docs with no existing doc are dropped. Unless cliargs['noworkerdocs']
    is set, also indexes a 'worker' doc with bulk/crawl timing stats.
    """
    if cliargs['chunkfiles']:
        updated_dirlist = []
        # check for existing directory docs in index and update crawl time only (dirchunk)
        for d in dirlist:
            try:
                # this key determines if it's part of a chunked dir
                path = d['chunkpath']
                crawltime = d['crawl_time']
                f = os.path.basename(path)
                # parent path
                p = os.path.abspath(os.path.join(path, os.pardir))
                data = {
                    "size": 1,
                    "_source": ['crawl_time'],
                    "query": {
                        "query_string": {
                            "query": 'filename: "%s" AND path_parent: "%s"' % (f, p)
                        }
                    }
                }
                # refresh so docs indexed by earlier chunks are visible
                es.indices.refresh(index=cliargs['index'])
                res = es.search(index=cliargs['index'], doc_type='directory',
                                body=data, request_timeout=config['es_timeout'])
                if len(res['hits']['hits']) == 0:
                    # no existing doc: drop this chunked dir doc
                    continue
                docid = res['hits']['hits'][0]['_id']
                current_crawltime = res['hits']['hits'][0]['_source']['crawl_time']
                # fixed typo: was 'udpated_crawltime'
                updated_crawltime = current_crawltime + crawltime
                # update crawltime in index
                d = {
                    '_op_type': 'update',
                    '_index': cliargs['index'],
                    '_type': 'directory',
                    '_id': docid,
                    'doc': {
                        'crawl_time': updated_crawltime
                    }
                }
            except KeyError:
                pass  # not part of a chunked dir
            updated_dirlist.append(d)
        dirlist = updated_dirlist

    starttime = time.time()
    docs = dirlist + filelist
    index_bulk_add(es, docs, config, cliargs)

    if not cliargs['noworkerdocs']:
        data = {
            "worker_name": worker_name,
            "dir_count": len(dirlist),
            "file_count": len(filelist),
            "bulk_time": round(time.time() - starttime, 6),
            # totalcrawltime defaults to None; guard against
            # TypeError from round(None, 6)
            "crawl_time": round(totalcrawltime, 6) if totalcrawltime is not None else 0,
            "indexing_date": datetime.utcnow().isoformat()
        }
        es.index(index=cliargs['index'], doc_type='worker', body=data)
def calc_dir_size(dirlist, cliargs):
    """This is the calculate directory size worker function.

    Gets a directory list from the Queue and searches ES for all subdirs
    in each directory (recursive), summing their filesize and items
    fields to create a total filesize and item count for each directory
    doc. Updates the directory doc's filesize and items fields via a
    bulk update. If more than half the bots are idle and the calc queue
    is empty, half of this batch is re-enqueued for other bots.
    """
    # check if other bots are idle and throw them some jobs (dir paths)
    if len(dirlist) >= cliargs['batchsize']:
        workers = Worker.all(connection=redis_conn)
        num_workers = len(workers)
        idle_count = 0
        half_idle = False
        for w in workers:
            if w._state == "idle":
                idle_count += 1
                # BUGFIX: original conflated the int counter with the
                # bool flag ('workers_idle == True' was truthy whenever
                # exactly 1 worker was idle), offloading below threshold
                if idle_count > num_workers // 2:
                    half_idle = True
                    break
        if len(q_calc) == 0 and half_idle:
            # take half the paths randomly
            shuffle(dirlist)
            n = len(dirlist) // 2
            tossdirs = dirlist[:n]
            dirlist = dirlist[n:]
            q_crawl.enqueue(calc_dir_size, args=(tossdirs, cliargs,))

    doclist = []
    for path in dirlist:
        totalitems = 1  # 1 for itself
        # escape special characters for the query_string queries
        newpath = escape_chars(path[1])
        parentpath = escape_chars(
            os.path.abspath(os.path.join(path[1], os.pardir)))
        pathbasename = escape_chars(os.path.basename(path[1]))

        # check if / (root) path (raw strings: '\/' is an invalid escape
        # sequence in a normal literal and warns on modern Python)
        if newpath == r'\/':
            # BUGFIX: original omitted total_files/total_subdirs aggs on
            # the root path, raising KeyError when read below
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                },
                "aggs": {
                    "total_size": {"sum": {"field": "filesize"}},
                    "total_files": {"sum": {"field": "items_files"}},
                    "total_subdirs": {"sum": {"field": "items_subdirs"}}
                }
            }
        else:
            newpathwildcard = newpath + r'\/*'
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query': '(path_parent: ' + parentpath + ' AND filename: '
                                 + pathbasename + ') OR path_parent: ' + newpath
                                 + ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard': 'true'
                    }
                },
                "aggs": {
                    "total_size": {"sum": {"field": "filesize"}},
                    "total_files": {"sum": {"field": "items_files"}},
                    "total_subdirs": {"sum": {"field": "items_subdirs"}}
                }
            }

        # search ES for all directory docs (subdirs)
        res = es.search(index=cliargs['index'], doc_type='directory',
                        body=data, request_timeout=config['es_timeout'])
        # total file size sum
        totalsize = res['aggregations']['total_size']['value']
        # total items sum for all subdirs count
        totalitems_subdirs = res['aggregations']['total_subdirs']['value']
        # total items sum for all files count
        totalitems_files = res['aggregations']['total_files']['value']
        totalitems += totalitems_subdirs + totalitems_files

        # update filesize and items fields for directory (path) doc
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {
                'filesize': totalsize,
                'items': totalitems,
                'items_files': totalitems_files,
                'items_subdirs': totalitems_subdirs
            }
        }
        doclist.append(d)
    index_bulk_add(es, doclist, config, cliargs)
def calc_dir_size(dirlist, cliargs):
    """This is the calculate directory size worker function.

    Gets a directory list from the Queue and searches ES for all subdirs
    in each directory (recursive), summing their filesize and items
    fields to create a total filesize and item count for each directory
    doc. Updates the directory doc's filesize and items fields via a
    bulk update.
    """
    doclist = []
    for path in dirlist:
        totalitems = 1  # 1 for itself
        # escape special characters for the query_string queries
        newpath = escape_chars(path[1])
        parentpath = escape_chars(
            os.path.abspath(os.path.join(path[1], os.pardir)))
        pathbasename = escape_chars(os.path.basename(path[1]))

        # check if / (root) path (raw strings: '\/' is an invalid escape
        # sequence in a normal literal and warns on modern Python)
        if newpath == r'\/':
            # BUGFIX: original omitted total_files/total_subdirs aggs on
            # the root path, raising KeyError when read below
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                },
                "aggs": {
                    "total_size": {"sum": {"field": "filesize"}},
                    "total_files": {"sum": {"field": "items_files"}},
                    "total_subdirs": {"sum": {"field": "items_subdirs"}}
                }
            }
        else:
            newpathwildcard = newpath + r'\/*'
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query': '(path_parent: ' + parentpath + ' AND filename: '
                                 + pathbasename + ') OR path_parent: ' + newpath
                                 + ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard': 'true'
                    }
                },
                "aggs": {
                    "total_size": {"sum": {"field": "filesize"}},
                    "total_files": {"sum": {"field": "items_files"}},
                    "total_subdirs": {"sum": {"field": "items_subdirs"}}
                }
            }

        # search ES for all directory docs (subdirs)
        res = es.search(index=cliargs['index'], doc_type='directory',
                        body=data, request_timeout=config['es_timeout'])
        # total file size sum
        totalsize = res['aggregations']['total_size']['value']
        # total items sum for all subdirs count
        totalitems_subdirs = res['aggregations']['total_subdirs']['value']
        # total items sum for all files count
        totalitems_files = res['aggregations']['total_files']['value']
        totalitems += totalitems_subdirs + totalitems_files

        # update filesize and items fields for directory (path) doc
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {
                'filesize': totalsize,
                'items': totalitems,
                'items_files': totalitems_files,
                'items_subdirs': totalitems_subdirs
            }
        }
        doclist.append(d)
    index_bulk_add(es, doclist, config, cliargs)