Ejemplo n.º 1
0
def get_files(index, path):
    """Collect the file docs stored in Elasticsearch for ``path``.

    The query string matches three groups of docs:

    * docs whose ``path_parent`` equals the escaped path,
    * docs whose ``path_parent`` matches the escaped path followed by
      ``\\/*``,
    * the doc for ``path`` itself, matched by its basename and the
      absolute path of its parent directory.

    Results are read page by page through the scroll API until a page
    comes back with no hits.

    Args:
        index: Name of the Elasticsearch index to refresh and search.
        path: Filesystem path whose file docs are wanted.

    Returns:
        A tuple ``(filelist, filelist_hashed, filelist_times)`` of three
        parallel lists: absolute file paths, the MD5 hex digest of each
        UTF-8 encoded path, and ``(mtime, ctime, atime)`` tuples of epoch
        seconds.
    """
    time_format = '%Y-%m-%dT%H:%M:%S'

    def to_epoch(stamp):
        # time.mktime() interprets the parsed struct_time as local time.
        return time.mktime(datetime.strptime(stamp, time_format).timetuple())

    newpath = escape_chars(path)
    # '\\/' is a backslash followed by a slash; it is written with an
    # explicit double backslash because '\/' is not a valid escape sequence.
    # NOTE(review): presumably this is what escape_chars() returns for the
    # root path "/" -- confirm against escape_chars.
    if newpath == '\\/':
        newpathwildcard = '\\/*'
    else:
        newpathwildcard = newpath + '\\/*'
    logger.info('Searching for all file docs in %s for path %s...', index,
                path)

    # Joined rather than %-formatted so the pieces are concatenated exactly
    # as plain string addition would do.
    query = ''.join([
        '(path_parent: ', newpath, ') OR ',
        '(path_parent: ', newpathwildcard, ') OR (filename: "',
        os.path.basename(path), '" AND path_parent: "',
        os.path.abspath(os.path.join(path, os.pardir)), '")',
    ])
    data = {
        '_source': [
            'path_parent', 'filename', 'last_modified', 'last_access',
            'last_change'
        ],
        'query': {
            'query_string': {
                'query': query,
            }
        }
    }

    es.indices.refresh(index=index)
    res = es.search(index=index,
                    doc_type='file',
                    scroll='1m',
                    size=config['es_scrollsize'],
                    body=data,
                    request_timeout=config['es_timeout'])

    filelist = []
    filelist_hashed = []
    filelist_times = []
    # An empty page of hits marks the end of the scroll.
    while res['hits']['hits']:
        for hit in res['hits']['hits']:
            source = hit['_source']
            fullpath = os.path.abspath(
                os.path.join(source['path_parent'], source['filename']))
            mtime = to_epoch(source['last_modified'])
            ctime = to_epoch(source['last_change'])
            atime = to_epoch(source['last_access'])
            filelist.append(fullpath)
            filelist_hashed.append(
                hashlib.md5(fullpath.encode('utf-8')).hexdigest())
            filelist_times.append((mtime, ctime, atime))
        # fetch the next page through the scroll api
        res = es.scroll(scroll_id=res['_scroll_id'],
                        scroll='1m',
                        request_timeout=config['es_timeout'])

    logger.info('Found %s file docs', len(filelist))
    return filelist, filelist_hashed, filelist_times
Ejemplo n.º 2
0
def populate_hashgroup(key, cliargs):
    """Build the hash group for ``key`` from the matching file docs in ES.

    Searches ``cliargs['index']`` for up to 1000 docs of type "file" whose
    ``filehash`` term equals ``key``.

    Returns None when the search yields exactly one hit. Otherwise returns
    a dict with the keys ``filehash`` (the given key), ``files`` (one dict
    per hit holding ``id``, ``filename``, ``atime`` and ``mtime``) and
    ``md5sum`` (an empty string).
    """
    query = {
        "_source":
        ["path_parent", "filename", "last_access", "last_modified", "type"],
        "query": {
            "bool": {
                "must": {"term": {"filehash": key}},
                "filter": {"term": {"type": "file"}},
            }
        },
    }
    response = es.search(index=cliargs['index'],
                         size="1000",
                         body=query,
                         request_timeout=config['es_timeout'])
    hits = response['hits']['hits']

    # a lone match has nothing to be compared against
    if len(hits) == 1:
        return None

    files = [
        {
            'id': hit['_id'],
            'filename':
            hit['_source']['path_parent'] + "/" + hit['_source']['filename'],
            'atime': hit['_source']['last_access'],
            'mtime': hit['_source']['last_modified'],
        }
        for hit in hits
    ]

    return {'filehash': key, 'files': files, 'md5sum': ''}
Ejemplo n.º 3
0
def dupes_finder(es, q, cliargs, logger):
    """Group indexed files by filehash and enqueue the possible duplicates.

    Scrolls through the file docs of ``cliargs['index']`` whose filesize
    lies between ``cliargs['minsize']`` and ``config['dupes_maxsize']``
    (restricted to docs with a hardlinks value of 1 unless
    ``cliargs['inchardlinks']`` is set), groups them by ``filehash`` and
    discards every group with fewer than two files. The remaining groups
    are sent to ``q`` in batches as ``dupes_process_hashkeys`` jobs, after
    which the function waits until ``worker_bots_busy([q])`` is false,
    updating a progress bar unless quiet, debug or verbose is set.

    Returns None; returns early when no possible dupes were found.
    """

    logger.info('Searching %s for all dupe filehashes...', cliargs['index'])

    # hard-linked files are only considered when inchardlinks is set;
    # otherwise the search is limited to docs with a hardlinks count of 1
    include_hardlinks = cliargs['inchardlinks']
    size_range = {
        "range": {
            "filesize": {
                "lte": config['dupes_maxsize'],
                "gte": cliargs['minsize']
            }
        }
    }
    if include_hardlinks:
        bool_clause = {"must": size_range}
    else:
        bool_clause = {
            "must": {"term": {"hardlinks": 1}},
            "filter": size_range,
        }
    data = {
        "size": 0,
        "_source": [
            'filename', 'filehash', 'path_parent', 'last_modified',
            'last_access'
        ],
        "query": {"bool": bool_clause},
    }

    # refresh the index, then run the search that opens the scroll
    es.indices.refresh(index=cliargs['index'])
    page = es.search(index=cliargs['index'],
                     doc_type='file',
                     scroll='1m',
                     size=config['es_scrollsize'],
                     body=data,
                     request_timeout=config['es_timeout'])

    grouped = {}
    while page['hits']['hits']:
        for hit in page['hits']['hits']:
            source = hit['_source']
            filehash = source['filehash']
            filepath = os.path.join(source['path_parent'],
                                    source['filename'])
            grouped.setdefault(filehash, []).append({
                'id': hit['_id'],
                'filename': filepath,
                'atime': source['last_access'],
                'mtime': source['last_modified'],
                'md5': ''
            })

        # next page via the es scroll api
        page = es.scroll(scroll_id=page['_scroll_id'],
                         scroll='1m',
                         request_timeout=config['es_timeout'])

    # only hashes shared by at least two files can be dupes
    filehashes = {
        filehash: files
        for filehash, files in grouped.items() if len(files) >= 2
    }
    possibledupescount = sum(len(files) for files in filehashes.values())

    logger.info('Found %s possible dupe files', possibledupescount)
    if possibledupescount == 0:
        return

    logger.info('Starting to enqueue dupe file hashes...')

    batchsize = ab_start if cliargs['adaptivebatch'] else cliargs['batchsize']
    if cliargs['verbose'] or cliargs['debug']:
        logger.info('Batch size: %s' % batchsize)

    hashgroups = []
    for filehash, files in filehashes.items():
        if cliargs['verbose'] or cliargs['debug']:
            logger.info('filehash: %s, filecount: %s' % (filehash, len(files)))
        hashgroups.append({'filehash': filehash, 'files': files})
        if len(hashgroups) < batchsize:
            continue
        # batch is full: send to rq for bots to process hashgroups list
        q.enqueue(dupes_process_hashkeys,
                  args=(hashgroups, cliargs),
                  result_ttl=config['redis_ttl'])
        if cliargs['debug'] or cliargs['verbose']:
            logger.info("enqueued batchsize: %s (batchsize: %s)" %
                        (len(hashgroups), batchsize))
        # empty the same list object in place, as the batch is reused
        hashgroups[:] = []
        if cliargs['adaptivebatch']:
            batchsize = adaptive_batch(q, cliargs, batchsize)
            if cliargs['debug'] or cliargs['verbose']:
                logger.info("batchsize set to: %s" % batchsize)

    # enqueue whatever is left over from the last partial batch
    if len(hashgroups) > 0:
        q.enqueue(dupes_process_hashkeys,
                  args=(hashgroups, cliargs),
                  result_ttl=config['redis_ttl'])

    logger.info(
        '%s possible dupe file hashes have been enqueued, worker bots processing dupes...'
        % possibledupescount)

    bar = None
    if not (cliargs['quiet'] or cliargs['debug'] or cliargs['verbose']):
        bar = progress_bar('Checking')
        bar.start()

    # update progress bar until bots are idle and queue is empty
    while worker_bots_busy([q]):
        if bar:
            remaining = len(q)
            try:
                bar.update(remaining)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        time.sleep(1)

    if bar:
        bar.finish()
Ejemplo n.º 4
0
def dupes_finder(es, q, cliargs, logger):
    """Enqueue one job per duplicated filehash found by an ES aggregation.

    Runs a terms aggregation on ``filehash`` over the file docs of
    ``cliargs['index']`` that have a hardlinks value of 1 and a filesize
    between ``cliargs['minsize']`` and ``config['dupes_maxsize']``. Only
    buckets with at least two docs are requested, at most 10000 of them,
    ordered by their largest filesize descending. Every bucket key is
    enqueued on ``q`` as a ``dupes_process_hashkey`` job.

    The function then polls until the queue is empty and no worker is
    busy, updating a progress bar unless quiet, debug or verbose is set.
    """

    logger.info('Searching %s for duplicate file hashes...', cliargs['index'])

    size_filter = {
        "range": {
            "filesize": {
                "lte": config['dupes_maxsize'],
                "gte": cliargs['minsize']
            }
        }
    }
    # buckets are ordered by the size of their largest file, biggest first
    hash_buckets = {
        "terms": {
            "field": "filehash",
            "min_doc_count": 2,
            "size": 10000,
            "order": {"max_file_size": "desc"}
        },
        "aggs": {
            "max_file_size": {"max": {"field": "filesize"}}
        }
    }
    data = {
        "size": 0,
        "query": {
            "bool": {
                "must": {"term": {"hardlinks": 1}},
                "filter": size_filter
            }
        },
        "aggs": {"dupe_filehash": hash_buckets}
    }

    # refresh index before searching
    es.indices.refresh(index=cliargs['index'])
    res = es.search(index=cliargs['index'],
                    doc_type='file',
                    body=data,
                    request_timeout=config['es_timeout'])

    logger.info('Found %s duplicate file hashes, enqueueing...',
                len(res['aggregations']['dupe_filehash']['buckets']))

    # one job per duplicated hash key
    for bucket in res['aggregations']['dupe_filehash']['buckets']:
        q.enqueue(dupes_process_hashkey,
                  args=(bucket['key'], cliargs),
                  result_ttl=config['redis_ttl'])

    logger.info('All file hashes have been enqueued')

    show_bar = not (cliargs['quiet'] or cliargs['debug']
                    or cliargs['verbose'])
    bar = None
    if show_bar:
        bar = progress_bar('Checking')
        bar.start()

    # poll until the queue has drained and every worker is idle
    time.sleep(1)
    while True:
        workers = SimpleWorker.all(connection=redis_conn)
        workers_busy = any(worker._state == "busy" for worker in workers)
        q_len = len(q)
        if show_bar:
            try:
                bar.update(q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        if q_len == 0 and not workers_busy:
            break
        time.sleep(.5)

    if show_bar:
        bar.finish()
Ejemplo n.º 5
0
def dupes_finder(es, q, cliargs, logger):
    """Enqueue the distinct filehashes of the indexed files in batches.

    Scrolls through the file docs of ``cliargs['index']`` that have a
    hardlinks value of 1 and a filesize between ``cliargs['minsize']`` and
    ``config['dupes_maxsize']``. Each distinct ``filehash`` is collected
    into a batch, and every full batch is enqueued on ``q`` as a
    ``dupes_process_hashkey`` job. The batch size starts at ``ab_start``
    when ``cliargs['adaptivebatch']`` is set (and is then re-tuned with
    ``adaptive_batch`` after each enqueue), otherwise it is
    ``cliargs['batchsize']``.

    Every hash is enqueued at most once across the whole run: hashes
    already seen are tracked in a set that outlives the individual
    batches.

    After enqueueing, the function waits until ``worker_bots_busy([q])``
    is false, updating a progress bar unless quiet, debug or verbose is
    set.

    Args:
        es: Elasticsearch client used for refresh, search and scroll.
        q: Queue that receives the jobs; ``len(q)`` feeds the progress bar.
        cliargs: Dict of command line options read by this function and
            passed on to the enqueued jobs.
        logger: Logger used for progress messages.
    """

    logger.info('Searching %s for all duplicate files...', cliargs['index'])

    if cliargs['adaptivebatch']:
        batchsize = ab_start
    else:
        batchsize = cliargs['batchsize']
    chatty = cliargs['verbose'] or cliargs['debug']
    if chatty:
        logger.info('Batch size: %s', batchsize)

    # only files with a hardlinks count of 1 inside the size range
    data = {
        "size": 0,
        "query": {
            "bool": {
                "must": {
                    "term": {"hardlinks": 1}
                },
                "filter": {
                    "range": {
                        "filesize": {
                            "lte": config['dupes_maxsize'],
                            "gte": cliargs['minsize']
                        }
                    }
                }
            }
        }
    }

    # refresh index
    es.indices.refresh(index=cliargs['index'])
    # search es and start scroll
    res = es.search(index=cliargs['index'], scroll='1m', doc_type='file',
                    size=config['es_scrollsize'], body=data,
                    request_timeout=config['es_timeout'])

    # Hashes already handed to a batch. Kept separate from the batch list
    # so that starting a new batch does not forget earlier hashes.
    seen = set()
    filehashlist = []
    # An empty page of hits marks the end of the scroll.
    while res['hits']['hits']:
        for hit in res['hits']['hits']:
            filehash = hit['_source']['filehash']
            if filehash in seen:
                continue
            seen.add(filehash)
            filehashlist.append(filehash)
            if len(filehashlist) < batchsize:
                continue
            # send to rq for bots to process file hashkey list
            q.enqueue(dupes_process_hashkey, args=(filehashlist, cliargs,),
                      result_ttl=config['redis_ttl'])
            if chatty:
                logger.info("enqueued batchsize: %s (batchsize: %s)",
                            len(filehashlist), batchsize)
            # Start a fresh list rather than clearing in place, so the
            # batch just handed to the queue can never be emptied here.
            filehashlist = []
            if cliargs['adaptivebatch']:
                batchsize = adaptive_batch(q, cliargs, batchsize)
                if chatty:
                    logger.info("batchsize set to: %s", batchsize)

        # use es scroll api
        res = es.scroll(scroll_id=res['_scroll_id'], scroll='1m',
                        request_timeout=config['es_timeout'])

    # enqueue any file hashes left over in the last partial batch
    if filehashlist:
        q.enqueue(dupes_process_hashkey, args=(filehashlist, cliargs,),
                  result_ttl=config['redis_ttl'])

    logger.info('%s file hashes have been enqueued', len(seen))

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar = progress_bar('Checking')
        bar.start()
    else:
        bar = None

    # update progress bar until bots are idle and queue is empty
    while worker_bots_busy([q]):
        if bar:
            q_len = len(q)
            try:
                bar.update(q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        time.sleep(1)

    if bar:
        bar.finish()