Code example #1
import os
import time
import hashlib
import pwd
import grp
from datetime import datetime

import diskover
import diskover_worker_bot


def qumulo_get_file_meta(path, cliargs, reindex_dict):
    filename = path['name']

    # check if file is in excluded_files list
    extension = os.path.splitext(filename)[1][1:].strip().lower()
    if diskover_worker_bot.file_excluded(filename, extension, path['path'],
                                         cliargs['verbose']):
        return None

    # get file size (bytes)
    size = path['size']

    # Skip files smaller than minsize cli flag
    if size < cliargs['minsize']:
        return None

    # check file modified time
    mtime_utc = path['modification_time']
    mtime_unix = time.mktime(time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))

    # Convert time in days (mtime cli arg) to seconds
    time_sec = cliargs['mtime'] * 86400
    file_mtime_sec = time.time() - mtime_unix
    # Only process files modified at least x days ago
    if file_mtime_sec < time_sec:
        return None

    # get change time
    ctime_utc = path['change_time']
    # get creation time
    creation_time_utc = path['creation_time']

    # create md5 hash of file using metadata filesize and mtime
    filestring = str(size) + str(mtime_unix)
    filehash = hashlib.md5(filestring.encode('utf-8')).hexdigest()
    # get time
    indextime_utc = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")
    # get absolute path of parent directory
    parentdir = os.path.abspath(os.path.join(path['path'], os.pardir))
    # get user id of owner
    uid = int(path['owner'])
    # try to get owner user name
    # first check cache
    if uid in diskover_worker_bot.uids:
        owner = diskover_worker_bot.owners[uid]
    # not in cache
    else:
        try:
            owner = pwd.getpwuid(uid).pw_name.split('\\')
            # remove domain before owner
            if len(owner) == 2:
                owner = owner[1]
            else:
                owner = owner[0]
        # if we can't find the owner's user name, use the uid number
        except KeyError:
            owner = uid
        # store it in cache
        if uid not in diskover_worker_bot.uids:
            diskover_worker_bot.uids.append(uid)
            diskover_worker_bot.owners[uid] = owner
    # get group id
    gid = int(path['group'])
    # try to get group name
    # first check cache
    if gid in diskover_worker_bot.gids:
        group = diskover_worker_bot.groups[gid]
    # not in cache
    else:
        try:
            group = grp.getgrgid(gid).gr_name.split('\\')
            # remove domain before group
            if len(group) == 2:
                group = group[1]
            else:
                group = group[0]
        # if we can't find the group name, use the gid number
        except KeyError:
            group = gid
        # store in cache
        if gid not in diskover_worker_bot.gids:
            diskover_worker_bot.gids.append(gid)
            diskover_worker_bot.groups[gid] = group

    # create file metadata dictionary
    filemeta_dict = {
        "filename": filename,
        "extension": extension,
        "path_parent": parentdir,
        "filesize": size,
        "owner": owner,
        "group": group,
        "last_modified": mtime_utc,
        "creation_time": creation_time_utc,
        "last_change": ctime_utc,
        "hardlinks": path['num_links'],
        "inode": path['id'],
        "filehash": filehash,
        "tag": "",
        "tag_custom": "",
        "dupe_md5": "",
        "indexing_date": indextime_utc,
        "worker_name": diskover_worker_bot.get_worker_name()
    }

    # search for and copy over any existing tags from reindex_dict
    for sublist in reindex_dict['file']:
        if sublist[0] == path['path']:
            filemeta_dict['tag'] = sublist[1]
            filemeta_dict['tag_custom'] = sublist[2]
            break

    # check plugins for adding extra meta data to filemeta_dict
    for plugin in diskover.plugins:
        try:
            # check if plugin is for file doc
            mappings = {'mappings': {'file': {'properties': {}}}}
            plugin.add_mappings(mappings)
            filemeta_dict.update(plugin.add_meta(path['path']))
        except KeyError:
            pass

    return filemeta_dict
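
A minimal usage sketch of the function above. The path dict mirrors the attribute names the function reads from the Qumulo REST API; every concrete value here, and the cliargs defaults, are hypothetical.

# Hypothetical invocation; real values come from the Qumulo API and
# diskover's CLI parser.
path = {
    'name': 'report.pdf',
    'path': '/share/docs/report.pdf',
    'size': 1048576,
    'modification_time': '2018-01-15T09:30:00',
    'change_time': '2018-01-15T09:30:00',
    'creation_time': '2018-01-10T08:00:00',
    'owner': '1001',
    'group': '1001',
    'num_links': 1,
    'id': '42',
}
cliargs = {'verbose': False, 'minsize': 1, 'mtime': 0}
reindex_dict = {'file': [], 'directory': []}

meta = qumulo_get_file_meta(path, cliargs, reindex_dict)
if meta is not None:
    print(meta['filename'], meta['filehash'])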
Code example #2
def qumulo_get_dir_meta(path, cliargs, reindex_dict, redis_conn):
    if path['path'] != '/':
        fullpath = path['path'].rstrip(os.path.sep)
    else:
        fullpath = path['path']
    mtime_utc = path['modification_time']
    mtime_unix = time.mktime(time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))
    ctime_utc = path['change_time']
    ctime_unix = time.mktime(time.strptime(ctime_utc, '%Y-%m-%dT%H:%M:%S'))
    creation_time_utc = path['creation_time']
    if cliargs['index2']:
        # check if directory times cached in Redis
        redis_dirtime = redis_conn.get(
            fullpath.encode('utf-8', errors='ignore'))
        if redis_dirtime:
            cached_times = float(redis_dirtime.decode('utf-8'))
            # check if cached times are the same as on disk
            current_times = float(mtime_unix + ctime_unix)
            if cached_times == current_times:
                return "sametimes"
    # get time now in utc
    indextime_utc = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")
    # get user id of owner
    try:
        uid = int(path['owner'])
        # try to get owner user name
        # first check cache
        if uid in diskover_worker_bot.uids:
            owner = diskover_worker_bot.owners[uid]
        # not in cache
        else:
            try:
                owner = pwd.getpwuid(uid).pw_name.split('\\')
                # remove domain before owner
                if len(owner) == 2:
                    owner = owner[1]
                else:
                    owner = owner[0]
            # if we can't find the owner's user name, use the uid number
            except KeyError:
                owner = uid
            # store it in cache
            if uid not in diskover_worker_bot.uids:
                diskover_worker_bot.uids.append(uid)
                diskover_worker_bot.owners[uid] = owner
    except ValueError:  # Qumulo local user type
        owner = path['owner']
    # get group id
    try:
        gid = int(path['group'])
        # try to get group name
        # first check cache
        if gid in diskover_worker_bot.gids:
            group = diskover_worker_bot.groups[gid]
        # not in cache
        else:
            try:
                group = grp.getgrgid(gid).gr_name.split('\\')
                # remove domain before group
                if len(group) == 2:
                    group = group[1]
                else:
                    group = group[0]
            # if we can't find the group name, use the gid number
            except KeyError:
                group = gid
            # store in cache
            if gid not in diskover_worker_bot.gids:
                diskover_worker_bot.gids.append(gid)
                diskover_worker_bot.groups[gid] = group
    except ValueError:  # Qumulo local group type
        group = path['group']

    filename = path['name']
    parentdir = os.path.abspath(os.path.join(fullpath, os.pardir))

    dirmeta_dict = {
        "filename": filename,
        "path_parent": parentdir,
        "filesize": 0,
        "items": 1,  # 1 for itself
        "items_files": 0,
        "items_subdirs": 0,
        "last_modified": mtime_utc,
        "creation_time": creation_time_utc,
        "last_change": ctime_utc,
        "hardlinks": path['num_links'],
        "inode": path['id'],
        "owner": owner,
        "group": group,
        "tag": "",
        "tag_custom": "",
        "indexing_date": indextime_utc,
        "worker_name": diskover_worker_bot.get_worker_name()
    }

    # search for and copy over any existing tags from reindex_dict
    for sublist in reindex_dict['directory']:
        if sublist[0] == fullpath:
            dirmeta_dict['tag'] = sublist[1]
            dirmeta_dict['tag_custom'] = sublist[2]
            break

    # check plugins for adding extra meta data to dirmeta_dict
    for plugin in diskover.plugins:
        try:
            # check if plugin is for directory doc
            mappings = {'mappings': {'directory': {'properties': {}}}}
            plugin.add_mappings(mappings)
            dirmeta_dict.update(plugin.add_meta(fullpath))
        except KeyError:
            pass

    # cache directory times in Redis
    redis_conn.set(fullpath.encode('utf-8', errors='ignore'),
                   mtime_unix + ctime_unix,
                   ex=diskover.config['redis_dirtimesttl'])

    return dirmeta_dict
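
Both uid/gid lookups above follow the same pattern: check a module-level cache, resolve the name via pwd/grp, and strip a Windows-style DOMAIN\ prefix. A standalone sketch of that pattern, with a plain dict standing in for diskover_worker_bot's caches:

import pwd

_owner_cache = {}  # stand-in for diskover_worker_bot.uids/owners

def resolve_owner(uid):
    """Map a uid to a user name, dropping any 'DOMAIN\\' prefix."""
    if uid in _owner_cache:
        return _owner_cache[uid]
    try:
        parts = pwd.getpwuid(uid).pw_name.split('\\')
        owner = parts[1] if len(parts) == 2 else parts[0]
    except KeyError:
        owner = uid  # no passwd entry; fall back to the numeric id
    _owner_cache[uid] = owner
    return owner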
Code example #3
File: diskover_s3.py  Project: tolmanam/diskover
import os
import gzip
import csv
from datetime import datetime
import time
import hashlib
try:
    from Queue import Queue as pyQueue
except ImportError:
    from queue import Queue as pyQueue
from threading import Thread, RLock
import diskover
import diskover_worker_bot

fake_dirs = []
buckets = []
workername = diskover_worker_bot.get_worker_name()

# create queue and threads for bulk adding to ES
s3queue = pyQueue()
s3threadlock = RLock()


def process_line(row, tree_dirs, tree_files, tree_crawltimes, cliargs):
    global fake_dirs

    starttime = time.time()
    n = 2
    # S3 inventory CSV columns (bucket, key, then optional fields)
    inventory_dict = {'s3_bucket': row[0], 's3_key': row[1]}
    try:
        inventory_dict['s3_size'] = int(row[n])
        n = n + 1
    except IndexError:
        pass
    # ... snippet truncated; the remaining columns are parsed the same
    # way as in process_s3_inventory below
Code example #4
def process_s3_inventory(inventory_file, cliargs):
    """Process s3 inventory function.
    Takes an S3 inventory file (gzipped csv), processes and bulk adds it
    into diskover index.
    """
    jobstart = time.time()
    tree = []
    workername = diskover_worker_bot.get_worker_name()

    with gzip.open(inventory_file, mode='rt') as f:
        reader = csv.reader(f, delimiter=',', quotechar='"')
        x = 0
        for row in reader:
            if x == 0:
                # create fake root /bucketname directory entry for s3 bucket
                time_utc_now = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S")
                time_utc_epoch_start = "1970-01-01T00:00:00"
                root_dict = {}
                root_dict['filename'] = row[0]
                root_dict['path_parent'] = "/s3"
                root_dict["filesize"] = 0
                root_dict["items"] = 1  # 1 for itself
                root_dict["items_files"] = 0
                root_dict["items_subdirs"] = 0
                root_dict["last_modified"] = time_utc_epoch_start
                root_dict["tag"] = ""
                root_dict["tag_custom"] = ""
                root_dict["indexing_date"] = time_utc_now
                root_dict["worker_name"] = workername
                root_dict["change_percent_filesize"] = ""
                root_dict["change_percent_items"] = ""
                root_dict["change_percent_items_files"] = ""
                root_dict["change_percent_items_subdirs"] = ""
                tree.append(('directory', root_dict))
                tree.append(('crawltime', '/s3/' + row[0], 0))
            starttime = time.time()
            n = 2
            # S3 inventory CSV columns (bucket, key, then optional fields)
            inventory_dict = {'s3_bucket': row[0], 's3_key': row[1]}
            try:
                inventory_dict['s3_size'] = int(row[n])
                n = n + 1
            except IndexError:
                pass
            try:
                inventory_dict['s3_last_modified_date'] = row[n]
                n = n + 1
            except IndexError:
                pass
            try:
                inventory_dict['s3_etag'] = row[n]
                n = n + 1
            except IndexError:
                pass
            try:
                inventory_dict['s3_storage_class'] = row[n]
                n = n + 1
            except IndexError:
                pass
            try:
                inventory_dict['s3_multipart_upload'] = row[n]
                n = n + 1
            except IndexError:
                pass
            try:
                inventory_dict['s3_replication_status'] = row[n]
                n = n + 1
            except IndexError:
                pass
            try:
                inventory_dict['s3_encryption_status'] = row[n]
            except IndexError:
                pass

            # prepare inventory dict for diskover index

            # fake path /s3/bucketname/key
            bucket = '/s3/' + row[0] + '/'
            path = os.path.join(bucket, inventory_dict['s3_key'])
            # check if directory
            if path.endswith('/'):
                isdir = True
                path = path.rstrip('/')
            else:
                isdir = False
            size = inventory_dict['s3_size']
            # filename
            filename = os.path.basename(path)
            # check if file is in excluded_files list
            extension = os.path.splitext(filename)[1][1:].strip().lower()
            if diskover_worker_bot.file_excluded(filename, extension, path,
                                                 cliargs['verbose']):
                continue
            # Skip files smaller than minsize cli flag
            if not isdir and size < cliargs['minsize']:
                continue
            # modified time
            mtime_utc = inventory_dict['s3_last_modified_date'].partition(
                '.')[0]
            # modified time in unix
            mtime_unix = time.mktime(
                time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))
            # get time
            indextime_utc = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")
            # get absolute path of parent directory
            parentdir = os.path.abspath(os.path.join(path, os.pardir))
            # absolute full path
            fullpath = os.path.abspath(os.path.join(parentdir, filename))

            # remove any keys (fields) we don't want to add to ES
            inventory_dict.pop('s3_size', None)
            inventory_dict.pop('s3_last_modified_date', None)

            if isdir:  # directory
                inventory_dict['filename'] = filename
                inventory_dict['path_parent'] = parentdir
                inventory_dict["filesize"] = 0
                inventory_dict["items"] = 1  # 1 for itself
                inventory_dict["items_files"] = 0
                inventory_dict["items_subdirs"] = 0
                inventory_dict["last_modified"] = mtime_utc
                inventory_dict["tag"] = ""
                inventory_dict["tag_custom"] = ""
                inventory_dict["indexing_date"] = indextime_utc
                inventory_dict["worker_name"] = workername
                inventory_dict["change_percent_filesize"] = ""
                inventory_dict["change_percent_items"] = ""
                inventory_dict["change_percent_items_files"] = ""
                inventory_dict["change_percent_items_subdirs"] = ""

                # add any autotags to inventory_dict
                if cliargs['autotag'] and len(
                        diskover.config['autotag_dirs']) > 0:
                    diskover_worker_bot.auto_tag(inventory_dict, 'directory',
                                                 mtime_unix, None, None)

                # check plugins for adding extra meta data to dirmeta_dict
                for plugin in diskover.plugins:
                    try:
                        # check if plugin is for directory doc
                        mappings = {
                            'mappings': {
                                'directory': {
                                    'properties': {}
                                }
                            }
                        }
                        plugin.add_mappings(mappings)
                        inventory_dict.update(plugin.add_meta(fullpath))
                    except KeyError:
                        pass

                tree.append(('directory', inventory_dict))
                tree.append(('crawltime', path, (time.time() - starttime)))

            else:  # file
                # Convert time in days (mtime cli arg) to seconds
                time_sec = cliargs['mtime'] * 86400
                file_mtime_sec = time.time() - mtime_unix
                # Only process files modified at least x days ago
                if file_mtime_sec < time_sec:
                    continue
                # create md5 hash of file using metadata filesize and mtime
                filestring = str(size) + str(mtime_unix)
                filehash = hashlib.md5(filestring.encode('utf-8')).hexdigest()

                inventory_dict['filename'] = filename
                inventory_dict['path_parent'] = parentdir
                inventory_dict["extension"] = extension
                inventory_dict["filesize"] = size
                inventory_dict["last_modified"] = mtime_utc
                inventory_dict["filehash"] = filehash
                inventory_dict["tag"] = ""
                inventory_dict["tag_custom"] = ""
                inventory_dict["dupe_md5"] = ""
                inventory_dict["indexing_date"] = indextime_utc
                inventory_dict["worker_name"] = workername

                # check plugins for adding extra meta data to inventory_dict
                for plugin in diskover.plugins:
                    try:
                        # check if plugin is for file doc
                        mappings = {'mappings': {'file': {'properties': {}}}}
                        plugin.add_mappings(mappings)
                        inventory_dict.update(plugin.add_meta(fullpath))
                    except KeyError:
                        pass

                # add any autotags to inventory_dict
                if cliargs['autotag'] and len(
                        diskover.config['autotag_files']) > 0:
                    diskover_worker_bot.auto_tag(inventory_dict, 'file',
                                                 mtime_unix, None, None)

                tree.append(('file', inventory_dict))

            if len(tree) >= diskover.config['es_chunksize']:
                diskover_worker_bot.es_bulk_adder(tree, cliargs)
                del tree[:]
            x = x + 1

    if len(tree) > 0:
        diskover_worker_bot.es_bulk_adder(tree, cliargs)
    elapsed_time = round(time.time() - jobstart, 3)
    diskover_worker_bot.bot_logger.info('*** FINISHED JOB, Elapsed Time: ' +
                                        str(elapsed_time))
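
The try/except chain near the top of the loop maps optional inventory columns onto named fields positionally, stopping at the first missing column. The same technique written as a loop over the expected field names (a sketch of an equivalent refactor, not the project's code):

def parse_inventory_row(row):
    """Positionally map an S3 inventory CSV row onto named fields.

    Columns after bucket and key are optional; the first IndexError
    means all later columns are absent too, so stop there.
    """
    fields = ('s3_size', 's3_last_modified_date', 's3_etag',
              's3_storage_class', 's3_multipart_upload',
              's3_replication_status', 's3_encryption_status')
    inventory = {'s3_bucket': row[0], 's3_key': row[1]}
    for n, field in enumerate(fields, start=2):
        try:
            inventory[field] = int(row[n]) if field == 's3_size' else row[n]
        except IndexError:
            break
    return inventory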
Code example #5
def qumulo_get_file_meta(path, cliargs, reindex_dict, bot_logger):
    filename = path['name']

    # check if file is in excluded_files list
    extension = os.path.splitext(filename)[1][1:].strip().lower()
    if diskover.file_excluded(filename, extension, path['path'],
                              diskover.config, bot_logger, cliargs['verbose']):
        return None

    # get file size (bytes)
    size = path['size']

    # Skip files smaller than minsize cli flag
    if size < cliargs['minsize']:
        return None

    # check file modified time
    mtime_utc = path['modification_time']
    mtime_unix = time.mktime(time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))

    # Convert time in days (mtime cli arg) to seconds
    time_sec = cliargs['mtime'] * 86400
    file_mtime_sec = time.time() - mtime_unix
    # Only process files modified at least x days ago
    if file_mtime_sec < time_sec:
        return None

    # get change time
    ctime_utc = path['change_time']
    # get creation time
    creation_time_utc = path['creation_time']

    # create md5 hash of file using metadata filesize and mtime
    filestring = str(size) + str(mtime_unix)
    filehash = hashlib.md5(filestring.encode('utf-8')).hexdigest()
    # get time
    indextime_utc = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")
    # get absolute path of parent directory
    parentdir = os.path.abspath(os.path.join(path['path'], os.pardir))

    # create file metadata dictionary
    filemeta_dict = {
        "filename": filename,
        "extension": extension,
        "path_parent": parentdir,
        "filesize": size,
        "owner": path['owner'],
        "group": path['group'],
        "last_modified": mtime_utc,
        "creation_time": creation_time_utc,
        "last_change": ctime_utc,
        "hardlinks": path['num_links'],
        "inode": path['id'],
        "filehash": filehash,
        "tag": "",
        "tag_custom": "",
        "dupe_md5": "",
        "indexing_date": indextime_utc,
        "worker_name": diskover_worker_bot.get_worker_name()
    }

    # search for and copy over any existing tags from reindex_dict
    for sublist in reindex_dict['file']:
        if sublist[0] == path['path']:
            filemeta_dict['tag'] = sublist[1]
            filemeta_dict['tag_custom'] = sublist[2]
            break

    # check plugins for adding extra meta data to filemeta_dict
    for plugin in diskover.plugins:
        try:
            # check if plugin is for file doc
            mappings = {'mappings': {'file': {'properties': {}}}}
            plugin.add_mappings(mappings)
            filemeta_dict.update(plugin.add_meta(path['path']))
        except KeyError:
            pass

    return filemeta_dict
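
Note that filehash is not a content hash: it is an md5 over the file's size and unix mtime, so it can be computed from metadata alone and used to find duplicate candidates cheaply. The same derivation in isolation:

import hashlib
import time

def metadata_filehash(size, mtime_utc):
    """md5 over size + unix mtime, as used for the 'filehash' field."""
    mtime_unix = time.mktime(time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))
    filestring = str(size) + str(mtime_unix)
    return hashlib.md5(filestring.encode('utf-8')).hexdigest()

# Files with identical size and mtime get the same hash:
assert (metadata_filehash(1024, '2018-01-15T09:30:00') ==
        metadata_filehash(1024, '2018-01-15T09:30:00'))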
Code example #6
def qumulo_get_dir_meta(path, cliargs, reindex_dict, bot_logger, redis_conn):
    if path['path'] != '/':
        fullpath = path['path'].rstrip(os.path.sep)
    else:
        fullpath = path['path']
    mtime_utc = path['modification_time']
    mtime_unix = time.mktime(time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))
    ctime_utc = path['change_time']
    ctime_unix = time.mktime(time.strptime(ctime_utc, '%Y-%m-%dT%H:%M:%S'))
    creation_time_utc = path['creation_time']
    if cliargs['index2']:
        # check if directory times cached in Redis
        redis_dirtime = redis_conn.get(
            fullpath.encode('utf-8', errors='ignore'))
        if redis_dirtime:
            cached_times = float(redis_dirtime.decode('utf-8'))
            # check if cached times are the same as on disk
            current_times = float(mtime_unix + ctime_unix)
            if cached_times == current_times:
                return "sametimes"
    # get time now in utc
    indextime_utc = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")

    filename = path['name']
    parentdir = os.path.abspath(os.path.join(fullpath, os.pardir))

    dirmeta_dict = {
        "filename": filename,
        "path_parent": parentdir,
        "filesize": 0,
        "items": 1,  # itself
        "last_modified": mtime_utc,
        "creation_time": creation_time_utc,
        "last_change": ctime_utc,
        "hardlinks": path['num_links'],
        "inode": path['id'],
        "owner": path['owner'],
        "group": path['group'],
        "tag": "",
        "tag_custom": "",
        "indexing_date": indextime_utc,
        "worker_name": diskover_worker_bot.get_worker_name()
    }

    # search for and copy over any existing tags from reindex_dict
    for sublist in reindex_dict['directory']:
        if sublist[0] == fullpath:
            dirmeta_dict['tag'] = sublist[1]
            dirmeta_dict['tag_custom'] = sublist[2]
            break

    # check plugins for adding extra meta data to dirmeta_dict
    for plugin in diskover.plugins:
        try:
            # check if plugin is for directory doc
            mappings = {'mappings': {'directory': {'properties': {}}}}
            plugin.add_mappings(mappings)
            dirmeta_dict.update(plugin.add_meta(fullpath))
        except KeyError:
            pass

    # cache directory times in Redis
    redis_conn.set(fullpath.encode('utf-8', errors='ignore'),
                   mtime_unix + ctime_unix,
                   ex=diskover.config['redis_dirtimesttl'])

    return dirmeta_dict
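
The Redis calls above cache mtime_unix + ctime_unix under the directory path with a TTL, which is what lets qumulo_get_dir_meta return "sametimes" for unchanged directories on a differential index run (cliargs['index2']). A minimal sketch of that round trip with redis-py; the connection and TTL value here are illustrative:

import redis

r = redis.Redis()  # hypothetical connection; diskover passes redis_conn in

def dir_unchanged(fullpath, mtime_unix, ctime_unix, ttl=604800):
    """Return True if the cached mtime+ctime for fullpath still matches."""
    key = fullpath.encode('utf-8', errors='ignore')
    cached = r.get(key)
    if cached is not None and float(cached.decode('utf-8')) == mtime_unix + ctime_unix:
        return True
    # cache (or refresh) the current times for the next crawl
    r.set(key, mtime_unix + ctime_unix, ex=ttl)
    return False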