Example 1
def make_fake_s3_dir(parent, file, cliargs):
    """Make fake s3 directory function.
    Creates a fake directory doc for es.
    Returns dictionary for directory doc.
    """
    global fake_dirs

    fullpath = os.path.abspath(os.path.join(parent, file))

    if fullpath in fake_dirs:
        return None

    mtime_utc = "1970-01-01T00:00:00"
    mtime_unix = time.mktime(time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))

    dir_dict = {}
    dir_dict['filename'] = file
    dir_dict['path_parent'] = parent
    dir_dict["filesize"] = 0
    dir_dict["items"] = 1  # 1 for itself
    dir_dict["items_files"] = 0
    dir_dict["items_subdirs"] = 0
    dir_dict["last_modified"] = mtime_utc
    dir_dict["tag"] = ""
    dir_dict["tag_custom"] = ""
    dir_dict["indexing_date"] = datetime.utcnow().isoformat()
    dir_dict["worker_name"] = workername
    dir_dict["change_percent_filesize"] = ""
    dir_dict["change_percent_items"] = ""
    dir_dict["change_percent_items_files"] = ""
    dir_dict["change_percent_items_subdirs"] = ""
    dir_dict["_type"] = "directory"

    # add any autotags to dir_dict
    if cliargs['autotag'] and len(config['autotag_dirs']) > 0:
        auto_tag(dir_dict, 'directory', mtime_unix, None, None)

    # check plugins for adding extra meta data to dir_dict
    for plugin in plugins:
        try:
            # check if plugin is for directory doc
            mappings = {'mappings': {'directory': {'properties': {}}}}
            plugin.add_mappings(mappings)
            dir_dict.update(plugin.add_meta(fullpath))
        except KeyError:
            pass

    # store in fake_dirs
    s3threadlock.acquire()
    fake_dirs.append(fullpath)
    s3threadlock.release()

    return dir_dict
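
For orientation, a minimal call might look like the sketch below. The cliargs value is a hypothetical stand-in, and the function also depends on module-level state (fake_dirs, s3threadlock, config, plugins, workername) being initialized:

cliargs = {'autotag': False}  # assumed minimal CLI args
dir_doc = make_fake_s3_dir('/s3/mybucket', 'data', cliargs)
if dir_doc is not None:
    print(dir_doc['filename'], dir_doc['path_parent'])  # data /s3/mybucket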
Example 2
def qumulo_get_file_meta(worker_name, path, cliargs, reindex_dict):
    filename = path['name']

    # check if file is in excluded_files list
    extension = os.path.splitext(filename)[1][1:].strip().lower()
    if file_excluded(filename, extension):
        return None

    # get file size (bytes)
    size = int(path['size'])

    # Skip files smaller than minsize cli flag
    if size < cliargs['minsize']:
        return None

    # check file modified time
    mtime_utc = path['modification_time']
    mtime_unix = time.mktime(time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))

    # Convert time in days (mtime cli arg) to seconds
    time_sec = cliargs['mtime'] * 86400
    file_mtime_sec = time.time() - mtime_unix
    # Only process files modified at least x days ago
    if file_mtime_sec < time_sec:
        return None

    # get change time
    ctime_utc = path['change_time']
    ctime_unix = time.mktime(time.strptime(ctime_utc, '%Y-%m-%dT%H:%M:%S'))
    # get creation time
    creation_time_utc = path['creation_time']

    # create md5 hash of file using metadata filesize and mtime
    filestring = str(size) + str(mtime_unix)
    filehash = hashlib.md5(filestring.encode('utf-8')).hexdigest()
    # get time
    indextime_utc = datetime.utcnow().isoformat()
    # get absolute path of parent directory
    parentdir = os.path.abspath(os.path.join(path['path'], os.pardir))
    # get user id of owner
    uid = path['owner']
    # try to get owner user name
    # first check cache
    if uid in uids:
        owner = owners[uid]
    # not in cache
    else:
        owner = uid
        # store it in cache
        if uid not in uids:
            uids.append(uid)
            owners[uid] = owner
    # get group id
    gid = path['group']
    # try to get group name
    # first check cache
    if gid in gids:
        group = groups[gid]
    # not in cache
    else:
        group = gid
        # store in cache
        if gid not in gids:
            gids.append(gid)
            groups[gid] = group

    # create file metadata dictionary
    filemeta_dict = {
        "filename": filename,
        "extension": extension,
        "path_parent": parentdir,
        "filesize": size,
        "owner": owner,
        "group": group,
        "last_modified": mtime_utc,
        "creation_time": creation_time_utc,
        "last_change": ctime_utc,
        "hardlinks": path['num_links'],
        "inode": str(path['id']),
        "filehash": filehash,
        "tag": "",
        "tag_custom": "",
        "dupe_md5": "",
        "indexing_date": indextime_utc,
        "worker_name": worker_name,
        "_type": "file"
    }

    # check plugins for adding extra meta data to filemeta_dict
    for plugin in plugins:
        try:
            # check if plugin is for file doc
            mappings = {'mappings': {'file': {'properties': {}}}}
            plugin.add_mappings(mappings)
            filemeta_dict.update(plugin.add_meta(path['path']))
        except KeyError:
            pass

    # add any autotags to filemeta_dict
    if cliargs['autotag'] and len(config['autotag_files']) > 0:
        auto_tag(filemeta_dict, 'file', mtime_unix, None, ctime_unix)

    # search for and copy over any existing tags from reindex_dict
    for sublist in reindex_dict['file']:
        if sublist[0] == path['path']:
            filemeta_dict['tag'] = sublist[1]
            filemeta_dict['tag_custom'] = sublist[2]
            break

    return filemeta_dict
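
Note that filehash above is not a content hash: it is an md5 over the file's size and mtime strings, which is cheap to compute and changes whenever the file does. A standalone sketch of the same scheme:

import hashlib
import time

# md5 over metadata (size + mtime), not over file contents
size = 1024
mtime_unix = time.mktime(time.strptime('2018-01-15T10:30:00', '%Y-%m-%dT%H:%M:%S'))
filehash = hashlib.md5((str(size) + str(mtime_unix)).encode('utf-8')).hexdigest()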
Example 3
def qumulo_get_dir_meta(worker_name, path, cliargs, reindex_dict, redis_conn):
    if path['path'] != '/':
        fullpath = path['path'].rstrip(os.path.sep)
    else:
        fullpath = path['path']
    mtime_utc = path['modification_time']
    mtime_unix = time.mktime(time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))
    ctime_utc = path['change_time']
    ctime_unix = time.mktime(time.strptime(ctime_utc, '%Y-%m-%dT%H:%M:%S'))
    creation_time_utc = path['creation_time']
    if cliargs['index2']:
        # check if directory times cached in Redis
        # encodebytes replaces base64.encodestring, which was removed in Python 3.9
        redis_dirtime = redis_conn.get(base64.encodebytes(fullpath.encode('utf-8', errors='ignore')))
        if redis_dirtime:
            cached_times = float(redis_dirtime.decode('utf-8'))
            # check if cached times are the same as on disk
            current_times = float(mtime_unix + ctime_unix)
            if cached_times == current_times:
                return "sametimes"
    # get time now in utc
    indextime_utc = datetime.utcnow().isoformat()
    # get user id of owner
    uid = path['owner']
    # try to get owner user name
    # first check cache
    if uid in uids:
        owner = owners[uid]
    # not in cache
    else:
        owner = uid
        # store it in cache
        if uid not in uids:
            uids.append(uid)
            owners[uid] = owner
    # get group id
    gid = path['group']
    # try to get group name
    # first check cache
    if gid in gids:
        group = groups[gid]
    # not in cache
    else:
        group = gid
        # store in cache
        if gid not in gids:
            gids.append(gid)
            groups[gid] = group

    filename = path['name']
    parentdir = os.path.abspath(os.path.join(fullpath, os.pardir))

    dirmeta_dict = {
        "filename": filename,
        "path_parent": parentdir,
        "filesize": 0,
        "items": 1,  # 1 for itself
        "items_files": 0,
        "items_subdirs": 0,
        "last_modified": mtime_utc,
        "creation_time": creation_time_utc,
        "last_change": ctime_utc,
        "hardlinks": path['num_links'],
        "inode": str(path['id']),
        "owner": owner,
        "group": group,
        "tag": "",
        "tag_custom": "",
        "indexing_date": indextime_utc,
        "worker_name": worker_name,
        "change_percent_filesize": "",
        "change_percent_items": "",
        "change_percent_items_files": "",
        "change_percent_items_subdirs": "",
        "_type": "directory"
    }

    # check plugins for adding extra meta data to dirmeta_dict
    for plugin in plugins:
        try:
            # check if plugin is for directory doc
            mappings = {'mappings': {'directory': {'properties': {}}}}
            plugin.add_mappings(mappings)
            dirmeta_dict.update(plugin.add_meta(fullpath))
        except KeyError:
            pass

    # add any autotags to dirmeta_dict
    if cliargs['autotag'] and len(config['autotag_dirs']) > 0:
        auto_tag(dirmeta_dict, 'directory', mtime_unix, None, ctime_unix)

    # search for and copy over any existing tags from reindex_dict
    for sublist in reindex_dict['directory']:
        if sublist[0] == fullpath:
            dirmeta_dict['tag'] = sublist[1]
            dirmeta_dict['tag_custom'] = sublist[2]
            break

    # cache directory times in Redis
    if config['redis_cachedirtimes'] in ('True', 'true'):
        redis_conn.set(base64.encodebytes(fullpath.encode('utf-8', errors='ignore')), mtime_unix + ctime_unix,
                       ex=config['redis_dirtimesttl'])

    return dirmeta_dict
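
The Redis caching above is what makes the index2 fast path work: the key is the base64-encoded directory path and the value is mtime + ctime, so an unchanged directory short-circuits with "sametimes". A minimal round-trip sketch, assuming a local Redis server and the redis-py package:

import base64
import redis

r = redis.Redis(host='localhost', port=6379)  # hypothetical connection
key = base64.encodebytes('/some/dir'.encode('utf-8', errors='ignore'))
times = 1516012200.0 + 1516012300.0  # mtime_unix + ctime_unix
r.set(key, times, ex=604800)  # cache with a 7-day TTL
cached = r.get(key)
if cached and float(cached.decode('utf-8')) == times:
    print('sametimes')  # directory unchanged since last index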
Example 4
def process_line(row, tree_dirs, tree_files, cliargs):
    global fake_dirs

    # n tracks the next CSV column; S3 inventory reports may omit optional trailing fields
    n = 2
    # S3 Inventory csv column headers
    inventory_dict = {'s3_bucket': row[0], 's3_key': row[1]}
    try:
        inventory_dict['s3_size'] = int(row[n])
        n = n + 1
    except IndexError:
        pass
    try:
        inventory_dict['s3_last_modified_date'] = row[n]
        n = n + 1
    except IndexError:
        pass
    try:
        inventory_dict['s3_etag'] = row[n]
        n = n + 1
    except IndexError:
        pass
    try:
        inventory_dict['s3_storage_class'] = row[n]
        n = n + 1
    except IndexError:
        pass
    try:
        inventory_dict['s3_multipart_upload'] = row[n]
        n = n + 1
    except IndexError:
        pass
    try:
        inventory_dict['s3_replication_status'] = row[n]
        n = n + 1
    except IndexError:
        pass
    try:
        inventory_dict['s3_encryption_status'] = row[n]
    except IndexError:
        pass

    # prepare inventory dict for diskover index

    # fake path /s3/bucketname/key
    bucket = '/s3/' + row[0] + '/'
    path = os.path.join(bucket, inventory_dict['s3_key'])
    # check if directory
    if path.endswith('/'):
        isdir = True
        path = path.rstrip('/')
        s3threadlock.acquire()
        fake_dirs.append(path)
        s3threadlock.release()
    else:
        isdir = False
        # add any directories in path to fake dirs
        splitpath = inventory_dict['s3_key'].split('/')
        # remove file name
        splitpath = splitpath[:-1]
        prev_path = bucket.rstrip('/')
        for p in splitpath:
            # create fake directory entry
            s3threadlock.acquire()
            dir_dict = make_fake_s3_dir(prev_path, p, cliargs)
            s3threadlock.release()
            current_path = os.path.join(prev_path, p)
            if dir_dict is None:
                prev_path = current_path
                continue
            tree_dirs.append(dir_dict)
            # increment items counts of parentdir
            for d in tree_dirs:
                if d['filename'] == os.path.basename(dir_dict['path_parent']):
                    d['items_subdirs'] += 1
                    d['items'] += 1
                    break
            prev_path = current_path

    size = inventory_dict['s3_size']
    # filename
    filename = os.path.basename(path)
    # check if file is in excluded_files list
    extension = os.path.splitext(filename)[1][1:].strip().lower()
    if file_excluded(filename, extension, path, cliargs['verbose']):
        return tree_dirs, tree_files
    # Skip files smaller than minsize cli flag
    if not isdir and size < cliargs['minsize']:
        return tree_dirs, tree_files
    # modified time
    mtime_utc = inventory_dict['s3_last_modified_date'].partition('.')[0]
    # modified time in unix
    mtime_unix = time.mktime(time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))
    # get time
    indextime_utc = datetime.utcnow().isoformat()
    # get absolute path of parent directory
    parentdir = os.path.abspath(os.path.join(path, os.pardir))
    # absolute full path
    fullpath = os.path.abspath(os.path.join(parentdir, filename))

    # remove any keys (fields) we don't want to add to ES
    inventory_dict.pop('s3_size', None)
    inventory_dict.pop('s3_last_modified_date', None)

    if isdir:  # directory
        inventory_dict['filename'] = filename
        inventory_dict['path_parent'] = parentdir
        inventory_dict["filesize"] = 0
        inventory_dict["items"] = 1  # 1 for itself
        inventory_dict["items_files"] = 0
        inventory_dict["items_subdirs"] = 0
        inventory_dict["last_modified"] = mtime_utc
        inventory_dict["tag"] = ""
        inventory_dict["tag_custom"] = ""
        inventory_dict["indexing_date"] = indextime_utc
        inventory_dict["worker_name"] = workername
        inventory_dict["change_percent_filesize"] = ""
        inventory_dict["change_percent_items"] = ""
        inventory_dict["change_percent_items_files"] = ""
        inventory_dict["change_percent_items_subdirs"] = ""
        inventory_dict["_type"] = "directory"

        # increment items counts of parentdir
        for d in tree_dirs:
            if d['filename'] == os.path.basename(parentdir):
                d['items_subdirs'] += 1
                d['items'] += 1
                break

        # add any autotags to inventory_dict
        if cliargs['autotag'] and len(config['autotag_dirs']) > 0:
            auto_tag(inventory_dict, 'directory', mtime_unix, None, None)

        # check plugins for adding extra meta data to inventory_dict
        for plugin in plugins:
            try:
                # check if plugin is for directory doc
                mappings = {'mappings': {'directory': {'properties': {}}}}
                plugin.add_mappings(mappings)
                inventory_dict.update(plugin.add_meta(fullpath))
            except KeyError:
                pass

        tree_dirs.append(inventory_dict)

    else:  # file
        # Convert time in days (mtime cli arg) to seconds
        time_sec = cliargs['mtime'] * 86400
        file_mtime_sec = time.time() - mtime_unix
        # Only process files modified at least x days ago
        if file_mtime_sec < time_sec:
            return tree_dirs, tree_files
        # create md5 hash of file using metadata filesize and mtime
        filestring = str(size) + str(mtime_unix)
        filehash = hashlib.md5(filestring.encode('utf-8')).hexdigest()

        inventory_dict['filename'] = filename
        inventory_dict['path_parent'] = parentdir
        inventory_dict["extension"] = extension
        inventory_dict["filesize"] = size
        inventory_dict["last_modified"] = mtime_utc
        inventory_dict["filehash"] = filehash
        inventory_dict["tag"] = ""
        inventory_dict["tag_custom"] = ""
        inventory_dict["dupe_md5"] = ""
        inventory_dict["indexing_date"] = indextime_utc
        inventory_dict["worker_name"] = workername
        inventory_dict["_type"] = "file"

        # add file size and increment items counts to parentdir
        for d in tree_dirs:
            if d['filename'] == os.path.basename(parentdir):
                d['filesize'] += size
                d['items_files'] += 1
                d['items'] += 1
                break

        # check plugins for adding extra meta data to inventory_dict
        for plugin in plugins:
            try:
                # check if plugin is for file doc
                mappings = {'mappings': {'file': {'properties': {}}}}
                plugin.add_mappings(mappings)
                inventory_dict.update(plugin.add_meta(fullpath))
            except KeyError:
                pass

        # add any autotags to inventory_dict
        if cliargs['autotag'] and len(config['autotag_files']) > 0:
            auto_tag(inventory_dict, 'file', mtime_unix, None, None)

        tree_files.append(inventory_dict)

    return tree_dirs, tree_files
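
For reference, an S3 inventory CSV row maps positionally onto the optional fields parsed at the top of process_line. A hypothetical row and call (the module globals such as fake_dirs, s3threadlock, config, and plugins must already be initialized):

# hypothetical inventory row: bucket, key, size, last modified, etag, storage class
row = ['mybucket', 'data/report.csv', '2048', '2018-01-15T10:30:00.000Z',
       'd41d8cd98f00b204e9800998ecf8427e', 'STANDARD']
cliargs = {'minsize': 0, 'mtime': 0, 'autotag': False, 'verbose': False}  # assumed flags
tree_dirs, tree_files = process_line(row, [], [], cliargs)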