import os
import pwd
import grp
import time
import hashlib
from datetime import datetime

import diskover
import diskover_worker_bot


def qumulo_get_file_meta(path, cliargs, reindex_dict):
    filename = path['name']

    # check if file is in excluded_files list
    extension = os.path.splitext(filename)[1][1:].strip().lower()
    if diskover_worker_bot.file_excluded(filename, extension, path['path'],
                                         cliargs['verbose']):
        return None

    # get file size (bytes)
    size = path['size']

    # skip files smaller than minsize cli flag
    if size < cliargs['minsize']:
        return None

    # check file modified time
    mtime_utc = path['modification_time']
    mtime_unix = time.mktime(time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))
    # convert time in days (mtime cli arg) to seconds
    time_sec = cliargs['mtime'] * 86400
    file_mtime_sec = time.time() - mtime_unix
    # only process files modified at least x days ago
    if file_mtime_sec < time_sec:
        return None

    # get change time
    ctime_utc = path['change_time']
    # get creation time
    creation_time_utc = path['creation_time']

    # create md5 hash of file using metadata filesize and mtime
    filestring = str(size) + str(mtime_unix)
    filehash = hashlib.md5(filestring.encode('utf-8')).hexdigest()

    # get time
    indextime_utc = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")

    # get absolute path of parent directory
    parentdir = os.path.abspath(os.path.join(path['path'], os.pardir))

    # get user id of owner
    uid = int(path['owner'])
    # try to get owner user name
    # first check cache
    if uid in diskover_worker_bot.uids:
        owner = diskover_worker_bot.owners[uid]
    # not in cache
    else:
        try:
            owner = pwd.getpwuid(uid).pw_name.split('\\')
            # remove domain before owner
            if len(owner) == 2:
                owner = owner[1]
            else:
                owner = owner[0]
        # if we can't find the owner's user name, use the uid number
        except KeyError:
            owner = uid
        # store it in cache
        if uid not in diskover_worker_bot.uids:
            diskover_worker_bot.uids.append(uid)
            diskover_worker_bot.owners[uid] = owner

    # get group id
    gid = int(path['group'])
    # try to get group name
    # first check cache
    if gid in diskover_worker_bot.gids:
        group = diskover_worker_bot.groups[gid]
    # not in cache
    else:
        try:
            group = grp.getgrgid(gid).gr_name.split('\\')
            # remove domain before group
            if len(group) == 2:
                group = group[1]
            else:
                group = group[0]
        # if we can't find the group name, use the gid number
        except KeyError:
            group = gid
        # store in cache
        if gid not in diskover_worker_bot.gids:
            diskover_worker_bot.gids.append(gid)
            diskover_worker_bot.groups[gid] = group

    # create file metadata dictionary
    filemeta_dict = {
        "filename": filename,
        "extension": extension,
        "path_parent": parentdir,
        "filesize": size,
        "owner": owner,
        "group": group,
        "last_modified": mtime_utc,
        "creation_time": creation_time_utc,
        "last_change": ctime_utc,
        "hardlinks": path['num_links'],
        "inode": path['id'],
        "filehash": filehash,
        "tag": "",
        "tag_custom": "",
        "dupe_md5": "",
        "indexing_date": indextime_utc,
        "worker_name": diskover_worker_bot.get_worker_name()
    }

    # search for and copy over any existing tags from reindex_dict
    for sublist in reindex_dict['file']:
        if sublist[0] == path['path']:
            filemeta_dict['tag'] = sublist[1]
            filemeta_dict['tag_custom'] = sublist[2]
            break

    # check plugins for adding extra meta data to filemeta_dict
    for plugin in diskover.plugins:
        try:
            # check if plugin is for file doc
            mappings = {'mappings': {'file': {'properties': {}}}}
            plugin.add_mappings(mappings)
            filemeta_dict.update(plugin.add_meta(path['path']))
        except KeyError:
            pass

    return filemeta_dict
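# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal, hypothetical call to qumulo_get_file_meta. The key names in
# sample_path mirror the lookups the function makes above; the values,
# the cliargs contents, and the empty reindex_dict are assumptions for
# illustration only.

def _example_file_meta():
    sample_path = {
        'name': 'report.pdf',
        'path': '/share/docs/report.pdf',
        'size': 1048576,
        'modification_time': '2018-01-15T09:30:00',
        'change_time': '2018-01-15T09:30:00',
        'creation_time': '2018-01-10T08:00:00',
        'owner': '1000',
        'group': '1000',
        'num_links': 1,
        'id': '4242',
    }
    # mtime 0 days and minsize 1 byte mean nothing is filtered out
    sample_cliargs = {'verbose': False, 'minsize': 1, 'mtime': 0}
    # empty reindex lists mean no existing tags are carried over
    reindex_dict = {'file': [], 'directory': []}
    return qumulo_get_file_meta(sample_path, sample_cliargs, reindex_dict)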
def qumulo_get_dir_meta(path, cliargs, reindex_dict, redis_conn):
    if path['path'] != '/':
        fullpath = path['path'].rstrip(os.path.sep)
    else:
        fullpath = path['path']

    mtime_utc = path['modification_time']
    mtime_unix = time.mktime(time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))
    ctime_utc = path['change_time']
    ctime_unix = time.mktime(time.strptime(ctime_utc, '%Y-%m-%dT%H:%M:%S'))
    creation_time_utc = path['creation_time']

    if cliargs['index2']:
        # check if directory times cached in Redis
        redis_dirtime = redis_conn.get(
            fullpath.encode('utf-8', errors='ignore'))
        if redis_dirtime:
            cached_times = float(redis_dirtime.decode('utf-8'))
            # check if cached times are the same as on disk
            current_times = float(mtime_unix + ctime_unix)
            if cached_times == current_times:
                return "sametimes"

    # get time now in utc
    indextime_utc = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")

    # get user id of owner
    try:
        uid = int(path['owner'])
        # try to get owner user name
        # first check cache
        if uid in diskover_worker_bot.uids:
            owner = diskover_worker_bot.owners[uid]
        # not in cache
        else:
            try:
                owner = pwd.getpwuid(uid).pw_name.split('\\')
                # remove domain before owner
                if len(owner) == 2:
                    owner = owner[1]
                else:
                    owner = owner[0]
            # if we can't find the owner's user name, use the uid number
            except KeyError:
                owner = uid
            # store it in cache
            if uid not in diskover_worker_bot.uids:
                diskover_worker_bot.uids.append(uid)
                diskover_worker_bot.owners[uid] = owner
    except ValueError:
        # Qumulo local user type
        owner = path['owner']

    # get group id
    try:
        gid = int(path['group'])
        # try to get group name
        # first check cache
        if gid in diskover_worker_bot.gids:
            group = diskover_worker_bot.groups[gid]
        # not in cache
        else:
            try:
                group = grp.getgrgid(gid).gr_name.split('\\')
                # remove domain before group
                if len(group) == 2:
                    group = group[1]
                else:
                    group = group[0]
            # if we can't find the group name, use the gid number
            except KeyError:
                group = gid
            # store in cache
            if gid not in diskover_worker_bot.gids:
                diskover_worker_bot.gids.append(gid)
                diskover_worker_bot.groups[gid] = group
    except ValueError:
        # Qumulo local group type
        group = path['group']

    filename = path['name']
    parentdir = os.path.abspath(os.path.join(fullpath, os.pardir))

    dirmeta_dict = {
        "filename": filename,
        "path_parent": parentdir,
        "filesize": 0,
        "items": 1,  # 1 for itself
        "items_files": 0,
        "items_subdirs": 0,
        "last_modified": mtime_utc,
        "creation_time": creation_time_utc,
        "last_change": ctime_utc,
        "hardlinks": path['num_links'],
        "inode": path['id'],
        "owner": owner,
        "group": group,
        "tag": "",
        "tag_custom": "",
        "indexing_date": indextime_utc,
        "worker_name": diskover_worker_bot.get_worker_name()
    }

    # search for and copy over any existing tags from reindex_dict
    for sublist in reindex_dict['directory']:
        if sublist[0] == fullpath:
            dirmeta_dict['tag'] = sublist[1]
            dirmeta_dict['tag_custom'] = sublist[2]
            break

    # check plugins for adding extra meta data to dirmeta_dict
    for plugin in diskover.plugins:
        try:
            # check if plugin is for directory doc
            mappings = {'mappings': {'directory': {'properties': {}}}}
            plugin.add_mappings(mappings)
            dirmeta_dict.update(plugin.add_meta(fullpath))
        except KeyError:
            pass

    # cache directory times in Redis
    redis_conn.set(fullpath.encode('utf-8', errors='ignore'),
                   mtime_unix + ctime_unix,
                   ex=diskover.config['redis_dirtimesttl'])

    return dirmeta_dict
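# --- Cache-key sketch (illustrative) ---
# The Redis cache above stores mtime_unix + ctime_unix under the directory
# path, so an unchanged directory short-circuits with "sametimes" on an
# index2 reindex. A minimal stand-in showing the same comparison; the
# _FakeRedis class is a hypothetical in-memory substitute, not part of
# diskover.

class _FakeRedis(object):
    """Hypothetical in-memory stand-in for a redis connection."""
    def __init__(self):
        self._store = {}

    def get(self, key):
        return self._store.get(key)

    def set(self, key, value, ex=None):  # TTL (ex) is ignored in this sketch
        self._store[key] = str(value).encode('utf-8')


def _dir_unchanged(redis_conn, fullpath, mtime_unix, ctime_unix):
    # same check as above: cached float equals current mtime+ctime sum
    cached = redis_conn.get(fullpath.encode('utf-8', errors='ignore'))
    if cached is None:
        return False
    return float(cached.decode('utf-8')) == float(mtime_unix + ctime_unix)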
import os
import gzip
import csv
from datetime import datetime
import time
import hashlib
try:
    from Queue import Queue as pyQueue
except ImportError:
    from queue import Queue as pyQueue
from threading import Thread, RLock

import diskover
import diskover_worker_bot

fake_dirs = []
buckets = []
workername = diskover_worker_bot.get_worker_name()

# create queue and threads for bulk adding to ES
s3queue = pyQueue()
s3threadlock = RLock()


def process_line(row, tree_dirs, tree_files, tree_crawltimes, cliargs):
    global fake_dirs

    starttime = time.time()
    n = 2
    # S3 Inventory csv column headers
    inventory_dict = {'s3_bucket': row[0], 's3_key': row[1]}
    try:
        inventory_dict['s3_size'] = int(row[n])
        n = n + 1
    except IndexError:
        pass
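# --- Inventory row sketch (illustrative) ---
# The positional parsing above assumes S3 Inventory CSV rows laid out as
# bucket, key, size, last_modified, etag, storage_class, multipart,
# replication, encryption; trailing columns may be absent depending on the
# inventory configuration, hence the IndexError guards. The sample values
# below are made up for illustration.

_SAMPLE_INVENTORY_ROW = [
    'mybucket',                          # s3_bucket
    'photos/2018/cat.jpg',               # s3_key
    '524288',                            # s3_size (bytes, parsed with int())
    '2018-03-01T12:00:00.000Z',          # s3_last_modified_date
    'd41d8cd98f00b204e9800998ecf8427e',  # s3_etag
    'STANDARD',                          # s3_storage_class
    'false',                             # s3_multipart_upload
    '',                                  # s3_replication_status
    'SSE-S3',                            # s3_encryption_status
]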
def process_s3_inventory(inventory_file, cliargs):
    """Process s3 inventory function.
    Takes an S3 inventory file (gzipped csv), processes and
    bulk adds it into diskover index.
    """
    jobstart = time.time()
    tree = []
    workername = diskover_worker_bot.get_worker_name()

    with gzip.open(inventory_file, mode='rt') as f:
        reader = csv.reader(f, delimiter=',', quotechar='"')
        x = 0
        for row in reader:
            if x == 0:
                # create fake root /bucketname directory entry for s3 bucket
                time_utc_now = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S")
                time_utc_epoch_start = "1970-01-01T00:00:00"
                root_dict = {}
                root_dict['filename'] = row[0]
                root_dict['path_parent'] = "/s3"
                root_dict["filesize"] = 0
                root_dict["items"] = 1  # 1 for itself
                root_dict["items_files"] = 0
                root_dict["items_subdirs"] = 0
                root_dict["last_modified"] = time_utc_epoch_start
                root_dict["tag"] = ""
                root_dict["tag_custom"] = ""
                root_dict["indexing_date"] = time_utc_now
                root_dict["worker_name"] = workername
                root_dict["change_percent_filesize"] = ""
                root_dict["change_percent_items"] = ""
                root_dict["change_percent_items_files"] = ""
                root_dict["change_percent_items_subdirs"] = ""
                tree.append(('directory', root_dict))
                tree.append(('crawltime', '/s3/' + row[0], 0))

            starttime = time.time()
            n = 2
            # S3 Inventory csv column headers
            inventory_dict = {'s3_bucket': row[0], 's3_key': row[1]}
            try:
                inventory_dict['s3_size'] = int(row[n])
                n = n + 1
            except IndexError:
                pass
            try:
                inventory_dict['s3_last_modified_date'] = row[n]
                n = n + 1
            except IndexError:
                pass
            try:
                inventory_dict['s3_etag'] = row[n]
                n = n + 1
            except IndexError:
                pass
            try:
                inventory_dict['s3_storage_class'] = row[n]
                n = n + 1
            except IndexError:
                pass
            try:
                inventory_dict['s3_multipart_upload'] = row[n]
                n = n + 1
            except IndexError:
                pass
            try:
                inventory_dict['s3_replication_status'] = row[n]
                n = n + 1
            except IndexError:
                pass
            try:
                inventory_dict['s3_encryption_status'] = row[n]
            except IndexError:
                pass

            # prepare inventory dict for diskover index

            # fake path /s3/bucketname/key
            bucket = '/s3/' + row[0] + '/'
            path = os.path.join(bucket, inventory_dict['s3_key'])
            # check if directory
            if path.endswith('/'):
                isdir = True
                path = path.rstrip('/')
            else:
                isdir = False
            size = inventory_dict['s3_size']
            # filename
            filename = os.path.basename(path)
            # check if file is in excluded_files list
            extension = os.path.splitext(filename)[1][1:].strip().lower()
            if diskover_worker_bot.file_excluded(filename, extension, path,
                                                 cliargs['verbose']):
                continue
            # skip files smaller than minsize cli flag
            if not isdir and size < cliargs['minsize']:
                continue

            # modified time
            mtime_utc = inventory_dict['s3_last_modified_date'].partition('.')[0]
            # modified time in unix
            mtime_unix = time.mktime(
                time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))

            # get time
            indextime_utc = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")

            # get absolute path of parent directory
            parentdir = os.path.abspath(os.path.join(path, os.pardir))
            # absolute full path
            fullpath = os.path.abspath(os.path.join(parentdir, filename))

            # remove any keys (fields) we don't want to add to ES
            inventory_dict.pop('s3_size', None)
            inventory_dict.pop('s3_last_modified_date', None)

            if isdir:  # directory
                inventory_dict['filename'] = filename
                inventory_dict['path_parent'] = parentdir
                inventory_dict["filesize"] = 0
                inventory_dict["items"] = 1  # 1 for itself
                inventory_dict["items_files"] = 0
                inventory_dict["items_subdirs"] = 0
                inventory_dict["last_modified"] = mtime_utc
                inventory_dict["tag"] = ""
                inventory_dict["tag_custom"] = ""
                inventory_dict["indexing_date"] = indextime_utc
                inventory_dict["worker_name"] = workername
                inventory_dict["change_percent_filesize"] = ""
                inventory_dict["change_percent_items"] = ""
                inventory_dict["change_percent_items_files"] = ""
                inventory_dict["change_percent_items_subdirs"] = ""

                # add any autotags to inventory_dict
                if cliargs['autotag'] and len(
                        diskover.config['autotag_dirs']) > 0:
                    diskover_worker_bot.auto_tag(inventory_dict, 'directory',
                                                 mtime_unix, None, None)

                # check plugins for adding extra meta data to dirmeta_dict
                for plugin in diskover.plugins:
                    try:
                        # check if plugin is for directory doc
                        mappings = {'mappings': {'directory': {'properties': {}}}}
                        plugin.add_mappings(mappings)
                        inventory_dict.update(plugin.add_meta(fullpath))
                    except KeyError:
                        pass

                tree.append(('directory', inventory_dict))
                tree.append(('crawltime', path, (time.time() - starttime)))

            else:  # file
                # convert time in days (mtime cli arg) to seconds
                time_sec = cliargs['mtime'] * 86400
                file_mtime_sec = time.time() - mtime_unix
                # only process files modified at least x days ago
                if file_mtime_sec < time_sec:
                    continue

                # create md5 hash of file using metadata filesize and mtime
                filestring = str(size) + str(mtime_unix)
                filehash = hashlib.md5(filestring.encode('utf-8')).hexdigest()

                inventory_dict['filename'] = filename
                inventory_dict['path_parent'] = parentdir
                inventory_dict["extension"] = extension
                inventory_dict["filesize"] = size
                inventory_dict["last_modified"] = mtime_utc
                inventory_dict["filehash"] = filehash
                inventory_dict["tag"] = ""
                inventory_dict["tag_custom"] = ""
                inventory_dict["dupe_md5"] = ""
                inventory_dict["indexing_date"] = indextime_utc
                inventory_dict["worker_name"] = workername

                # check plugins for adding extra meta data to inventory_dict
                for plugin in diskover.plugins:
                    try:
                        # check if plugin is for file doc
                        mappings = {'mappings': {'file': {'properties': {}}}}
                        plugin.add_mappings(mappings)
                        inventory_dict.update(plugin.add_meta(fullpath))
                    except KeyError:
                        pass

                # add any autotags to inventory_dict
                if cliargs['autotag'] and len(
                        diskover.config['autotag_files']) > 0:
                    diskover_worker_bot.auto_tag(inventory_dict, 'file',
                                                 mtime_unix, None, None)

                tree.append(('file', inventory_dict))

            # bulk add to ES once the tree reaches the configured chunk size
            if len(tree) >= diskover.config['es_chunksize']:
                diskover_worker_bot.es_bulk_adder(tree, cliargs)
                del tree[:]

            x = x + 1

    # flush any remaining docs
    if len(tree) > 0:
        diskover_worker_bot.es_bulk_adder(tree, cliargs)

    elapsed_time = round(time.time() - jobstart, 3)
    diskover_worker_bot.bot_logger.info('*** FINISHED JOB, Elapsed Time: ' +
                                        str(elapsed_time))
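# --- Usage sketch (illustrative) ---
# process_s3_inventory takes a path to a gzipped inventory csv and the
# parsed cli args. The file name and cliargs values below are assumptions
# for illustration; a real run would pass the args produced by diskover's
# cli parser.

def _example_s3_run():
    cliargs = {'autotag': False, 'verbose': False, 'minsize': 1, 'mtime': 0}
    process_s3_inventory('inventory/mybucket-2018-03-01.csv.gz', cliargs)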
def qumulo_get_file_meta(path, cliargs, reindex_dict, bot_logger):
    filename = path['name']

    # check if file is in excluded_files list
    extension = os.path.splitext(filename)[1][1:].strip().lower()
    if diskover.file_excluded(filename, extension, path['path'],
                              diskover.config, bot_logger,
                              cliargs['verbose']):
        return None

    # get file size (bytes)
    size = path['size']

    # skip files smaller than minsize cli flag
    if size < cliargs['minsize']:
        return None

    # check file modified time
    mtime_utc = path['modification_time']
    mtime_unix = time.mktime(time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))
    # convert time in days (mtime cli arg) to seconds
    time_sec = cliargs['mtime'] * 86400
    file_mtime_sec = time.time() - mtime_unix
    # only process files modified at least x days ago
    if file_mtime_sec < time_sec:
        return None

    # get change time
    ctime_utc = path['change_time']
    # get creation time
    creation_time_utc = path['creation_time']

    # create md5 hash of file using metadata filesize and mtime
    filestring = str(size) + str(mtime_unix)
    filehash = hashlib.md5(filestring.encode('utf-8')).hexdigest()

    # get time
    indextime_utc = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")

    # get absolute path of parent directory
    parentdir = os.path.abspath(os.path.join(path['path'], os.pardir))

    # create file metadata dictionary
    filemeta_dict = {
        "filename": filename,
        "extension": extension,
        "path_parent": parentdir,
        "filesize": size,
        "owner": path['owner'],
        "group": path['group'],
        "last_modified": mtime_utc,
        "creation_time": creation_time_utc,
        "last_change": ctime_utc,
        "hardlinks": path['num_links'],
        "inode": path['id'],
        "filehash": filehash,
        "tag": "",
        "tag_custom": "",
        "dupe_md5": "",
        "indexing_date": indextime_utc,
        "worker_name": diskover_worker_bot.get_worker_name()
    }

    # search for and copy over any existing tags from reindex_dict
    for sublist in reindex_dict['file']:
        if sublist[0] == path['path']:
            filemeta_dict['tag'] = sublist[1]
            filemeta_dict['tag_custom'] = sublist[2]
            break

    # check plugins for adding extra meta data to filemeta_dict
    for plugin in diskover.plugins:
        try:
            # check if plugin is for file doc
            mappings = {'mappings': {'file': {'properties': {}}}}
            plugin.add_mappings(mappings)
            filemeta_dict.update(plugin.add_meta(path['path']))
        except KeyError:
            pass

    return filemeta_dict
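# --- Filehash sketch (illustrative) ---
# Both qumulo_get_file_meta variants and the s3 inventory path derive
# "filehash" from metadata only (size plus unix mtime), never file
# contents, so it is cheap to compute but changes whenever either value
# changes. The standalone form, extracted here for clarity:

def _metadata_filehash(size, mtime_unix):
    filestring = str(size) + str(mtime_unix)
    return hashlib.md5(filestring.encode('utf-8')).hexdigest()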
def qumulo_get_dir_meta(path, cliargs, reindex_dict, bot_logger, redis_conn):
    if path['path'] != '/':
        fullpath = path['path'].rstrip(os.path.sep)
    else:
        fullpath = path['path']

    mtime_utc = path['modification_time']
    mtime_unix = time.mktime(time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))
    ctime_utc = path['change_time']
    ctime_unix = time.mktime(time.strptime(ctime_utc, '%Y-%m-%dT%H:%M:%S'))
    creation_time_utc = path['creation_time']

    if cliargs['index2']:
        # check if directory times cached in Redis
        redis_dirtime = redis_conn.get(
            fullpath.encode('utf-8', errors='ignore'))
        if redis_dirtime:
            cached_times = float(redis_dirtime.decode('utf-8'))
            # check if cached times are the same as on disk
            current_times = float(mtime_unix + ctime_unix)
            if cached_times == current_times:
                return "sametimes"

    # get time now in utc
    indextime_utc = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")

    filename = path['name']
    parentdir = os.path.abspath(os.path.join(fullpath, os.pardir))

    dirmeta_dict = {
        "filename": filename,
        "path_parent": parentdir,
        "filesize": 0,
        "items": 1,  # itself
        "last_modified": mtime_utc,
        "creation_time": creation_time_utc,
        "last_change": ctime_utc,
        "hardlinks": path['num_links'],
        "inode": path['id'],
        "owner": path['owner'],
        "group": path['group'],
        "tag": "",
        "tag_custom": "",
        "indexing_date": indextime_utc,
        "worker_name": diskover_worker_bot.get_worker_name()
    }

    # search for and copy over any existing tags from reindex_dict
    for sublist in reindex_dict['directory']:
        if sublist[0] == fullpath:
            dirmeta_dict['tag'] = sublist[1]
            dirmeta_dict['tag_custom'] = sublist[2]
            break

    # check plugins for adding extra meta data to dirmeta_dict
    for plugin in diskover.plugins:
        try:
            # check if plugin is for directory doc
            mappings = {'mappings': {'directory': {'properties': {}}}}
            plugin.add_mappings(mappings)
            dirmeta_dict.update(plugin.add_meta(fullpath))
        except KeyError:
            pass

    # cache directory times in Redis
    redis_conn.set(fullpath.encode('utf-8', errors='ignore'),
                   mtime_unix + ctime_unix,
                   ex=diskover.config['redis_dirtimesttl'])

    return dirmeta_dict