Example #1
0
def index_dupes(hashgroup, cliargs, bot_logger):
    """Update the ``dupe_md5`` field in Elasticsearch for duplicate files.

    For every file in *hashgroup*, queues a bulk ``update`` action that
    sets the file document's ``dupe_md5`` field to the group's md5sum,
    then submits the whole batch in a single bulk request.

    Args:
        hashgroup (dict): group with a 'files' list (each entry carrying
            an 'id' key) and an 'md5sum' key.
        cliargs (dict): parsed CLI arguments; reads 'index' and 'verbose'.
        bot_logger (logging.Logger): logger for progress messages.
    """
    # create Elasticsearch connection
    es = diskover.elasticsearch_connect(diskover.config)
    # build all bulk-update actions in one pass
    file_id_list = [
        {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'file',
            '_id': f['id'],
            'doc': {
                'dupe_md5': hashgroup['md5sum']
            }
        }
        for f in hashgroup['files']
    ]
    if file_id_list:
        if cliargs['verbose']:
            # lazy %-args: message is only formatted if the record is emitted
            bot_logger.info('Bulk updating %s files in ES index',
                            len(file_id_list))
        diskover.index_bulk_add(es, file_id_list, 'file', diskover.config,
                                cliargs)
Example #2
0
def populate_hashgroup(key, cliargs):
    """Search ES for all files matching a hashgroup key (filehash).

    Args:
        key (str): filehash value to match against the 'filehash' field.
        cliargs (dict): parsed CLI arguments; reads 'index' and 'verbose'.

    Returns:
        dict: ``{'filehash': key, 'files': [...], 'md5sum': ''}`` where
        each file entry has 'id' (the ES document id) and 'filename'
        (parent path joined to the file name with "/").
    """

    bot_logger = diskover_worker_bot.bot_logger

    # create Elasticsearch connection
    es = diskover.elasticsearch_connect(diskover.config)

    if cliargs['verbose']:
        # lazy %-args: formatting skipped unless the record is emitted
        bot_logger.info('Searching ES for all files matching hash key %s',
                        key)

    # exact-match query on the filehash keyword field
    data = {
        "_source": ["path_parent", "filename"],
        "query": {
            "bool": {
                "must": {
                    "term": {
                        "filehash": key
                    }
                }
            }
        }
    }
    # refresh index
    # ES.indices.refresh(index=cliargs['index'])
    res = es.search(index=cliargs['index'],
                    doc_type='file',
                    size=1000,  # int, not str: 'size' is a numeric parameter
                    body=data,
                    request_timeout=diskover.config['es_timeout'])

    # collect hits into the hashgroup file list
    hashgroup_files = [
        {
            'id': hit['_id'],
            'filename': (hit['_source']['path_parent'] + "/" +
                         hit['_source']['filename'])
        }
        for hit in res['hits']['hits']
    ]

    if cliargs['verbose']:
        bot_logger.info('Found %s files matching hash key %s',
                        len(hashgroup_files), key)

    # return filehash group and add to queue
    return {'filehash': key, 'files': hashgroup_files, 'md5sum': ''}
Example #3
0
import os
import hashlib
import socket
import pwd
import grp
import time
import logging

# cache uid/gid names
# NOTE(review): populated elsewhere in this module (usage is outside this
# chunk) — presumably lists of already-seen uids/gids plus name maps; confirm
# against the lookup code before relying on the exact shapes.
uids = []
gids = []
owners = {}
groups = {}

# create Elasticsearch connection (module-level shared client)
es = diskover.elasticsearch_connect(diskover.config)

# create Redis connection using host/port/password from the diskover config
redis_conn = Redis(host=diskover.config['redis_host'],
                   port=diskover.config['redis_port'],
                   password=diskover.config['redis_password'])


def parse_cli_args():
    """This is the parse CLI arguments function.
    It parses command line arguments.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-b",
        "--burst",