Example #1
0
def start_importing(es, cliargs, logger):
    """Start importing S3 inventory file(s).

    Spawns daemon reader threads on the shared s3queue, indexes a
    placeholder diskspace doc (an S3 bucket has no real fs stats),
    enqueues each inventory file, and — unless output is suppressed —
    drives a progress bar until the queue drains.

    Args:
        es: Elasticsearch client instance.
        cliargs: dict of parsed CLI args ('s3', 'index', 'quiet',
            'debug', 'verbose', ...).
        logger: logger for status messages.
    """

    # daemon threads so they never block interpreter exit
    for i in range(4):
        thread = Thread(target=csv_file_reader, args=(s3queue, ))
        thread.daemon = True
        thread.start()

    # start importing S3 inventory file(s)
    inventory_files = cliargs['s3']
    logger.info('Importing %s S3 inventory file(s)...' % len(inventory_files))

    # add fake disk space to index with path set to /s3; sizes are
    # zeroed because no real filesystem stats exist for a bucket
    data = {
        "path": '/s3',
        "total": 0,
        "used": 0,
        "free": 0,
        "available": 0,
        "indexing_date": datetime.utcnow().isoformat()
    }
    es.index(index=cliargs['index'], doc_type='diskspace', body=data)

    # add all s3 inventory files to queue
    for file in inventory_files:
        s3queue.put((file, cliargs))

    # only create/start the bar when it will actually be updated and
    # finished (fixes the bar being started but never finished in
    # quiet/debug/verbose mode)
    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar = progress_bar('Importing')
        bar.start()
        while s3queue.qsize() > 0:
            try:
                percent = int("{0:.0f}".format(
                    100 * ((len(inventory_files) - s3queue.qsize()) /
                           float(len(inventory_files)))))
                bar.update(percent)
            except (ZeroDivisionError, ValueError):
                # no files, or bar rejects the value — show 0 instead
                bar.update(0)
            time.sleep(.5)
        bar.finish()

    # wait for queue to be empty
    s3queue.join()
Example #2
0
def dupes_finder(es, q, cliargs, logger):
    """This is the duplicate file finder function.
    It searches Elasticsearch for files that have the same filehashes
    and adds file hash groups to Queue.

    Args:
        es: Elasticsearch client instance.
        q: rq Queue consumed by worker bots.
        cliargs: dict of parsed CLI args ('index', 'inchardlinks',
            'minsize', 'adaptivebatch', 'batchsize', 'quiet', 'debug',
            'verbose', ...).
        logger: logger for status messages.
    """

    logger.info('Searching %s for all dupe filehashes...', cliargs['index'])

    # size range filter shared by both query shapes
    size_range = {
        "range": {
            "filesize": {
                "lte": config['dupes_maxsize'],
                "gte": cliargs['minsize']
            }
        }
    }
    # unless hardlinked files are included, restrict to files with a
    # hardlinks count of 1 so the same inode isn't counted twice
    if cliargs['inchardlinks']:
        bool_query = {"must": size_range}
    else:
        bool_query = {
            "must": {"term": {"hardlinks": 1}},
            "filter": size_range
        }
    data = {
        "size": 0,
        "_source": [
            'filename', 'filehash', 'path_parent', 'last_modified',
            'last_access'
        ],
        "query": {"bool": bool_query}
    }

    # refresh index
    es.indices.refresh(index=cliargs['index'])
    # search es and start scroll
    res = es.search(index=cliargs['index'],
                    doc_type='file',
                    scroll='1m',
                    size=config['es_scrollsize'],
                    body=data,
                    request_timeout=config['es_timeout'])

    # group all files by filehash; md5 is filled in later by the bots
    filehashes = {}
    while res['hits']['hits'] and len(res['hits']['hits']) > 0:
        for hit in res['hits']['hits']:
            source = hit['_source']
            filepath = os.path.join(source['path_parent'],
                                    source['filename'])
            filehashes.setdefault(source['filehash'], []).append({
                'id': hit['_id'],
                'filename': filepath,
                'atime': source['last_access'],
                'mtime': source['last_modified'],
                'md5': ''
            })

        # use es scroll api
        res = es.scroll(scroll_id=res['_scroll_id'],
                        scroll='1m',
                        request_timeout=config['es_timeout'])

    # drop hashes with only one file; the rest are possible dupes
    possibledupescount = 0
    for key, value in list(filehashes.items()):
        filehash_filecount = len(value)
        if filehash_filecount < 2:
            del filehashes[key]
        else:
            possibledupescount += filehash_filecount

    logger.info('Found %s possible dupe files', possibledupescount)
    if possibledupescount == 0:
        return

    logger.info('Starting to enqueue dupe file hashes...')

    if cliargs['adaptivebatch']:
        batchsize = ab_start
    else:
        batchsize = cliargs['batchsize']
    if cliargs['verbose'] or cliargs['debug']:
        logger.info('Batch size: %s' % batchsize)

    n = 0
    hashgroups = []
    for key, value in filehashes.items():
        if cliargs['verbose'] or cliargs['debug']:
            logger.info('filehash: %s, filecount: %s' % (key, len(value)))
        hashgroups.append({'filehash': key, 'files': value})
        n += 1
        if n >= batchsize:
            # send to rq for bots to process hashgroups list
            q.enqueue(dupes_process_hashkeys,
                      args=(
                          hashgroups,
                          cliargs,
                      ),
                      result_ttl=config['redis_ttl'])
            if cliargs['debug'] or cliargs['verbose']:
                logger.info("enqueued batchsize: %s (batchsize: %s)" %
                            (n, batchsize))
            del hashgroups[:]
            n = 0
            if cliargs['adaptivebatch']:
                batchsize = adaptive_batch(q, cliargs, batchsize)
                if cliargs['debug'] or cliargs['verbose']:
                    logger.info("batchsize set to: %s" % batchsize)

    # enqueue any remaining hash groups in a final partial batch
    if n > 0:
        q.enqueue(dupes_process_hashkeys,
                  args=(
                      hashgroups,
                      cliargs,
                  ),
                  result_ttl=config['redis_ttl'])

    logger.info(
        '%s possible dupe file hashes have been enqueued, worker bots processing dupes...'
        % possibledupescount)

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar = progress_bar('Checking')
        bar.start()
    else:
        bar = None

    # update progress bar until bots are idle and queue is empty
    while worker_bots_busy([q]):
        if bar:
            q_len = len(q)
            try:
                bar.update(q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        time.sleep(1)

    if bar:
        bar.finish()
Example #3
0
def start_importing(es, cliargs, logger):
    """Start importing S3 inventory file(s).

    Spawns daemon reader threads on the shared s3queue, indexes a
    placeholder diskspace doc and a fake /s3 root directory doc,
    enqueues each inventory file, and — unless output is suppressed —
    drives a progress bar until the queue drains.

    Args:
        es: Elasticsearch client instance.
        cliargs: dict of parsed CLI args ('s3', 'index', 'quiet',
            'debug', 'verbose', ...).
        logger: logger for status messages.
    """

    # daemon threads so they never block interpreter exit
    for i in range(4):
        thread = Thread(target=csv_file_reader, args=(s3queue, ))
        thread.daemon = True
        thread.start()

    # start importing S3 inventory file(s)
    inventory_files = cliargs['s3']
    logger.info('Importing %s S3 inventory file(s)...' % len(inventory_files))

    # add fake disk space to index with path set to /s3; sizes are
    # zeroed because no real filesystem stats exist for a bucket
    data = {
        "path": '/s3',
        "total": 0,
        "used": 0,
        "free": 0,
        "available": 0,
        "indexing_date": datetime.utcnow().isoformat()
    }
    es.index(index=cliargs['index'], doc_type='diskspace', body=data)

    # create fake root directory doc so /s3 exists as a parent dir
    time_utc_now = datetime.utcnow().isoformat()
    time_utc_epoch_start = "1970-01-01T00:00:00"
    root_dict = {
        'filename': "s3",
        'path_parent': "/",
        "filesize": 0,
        "items": 1,  # 1 for itself
        "items_files": 0,
        "items_subdirs": 0,
        "last_modified": time_utc_epoch_start,
        "tag": "",
        "tag_custom": "",
        "indexing_date": time_utc_now,
        "worker_name": "main",
        "change_percent_filesize": "",
        "change_percent_items": "",
        "change_percent_items_files": "",
        "change_percent_items_subdirs": ""
    }
    es.index(index=cliargs['index'], doc_type='directory', body=root_dict)
    diskover.add_crawl_stats(es, cliargs['index'], '/s3', 0)

    # add all s3 inventory files to queue
    for file in inventory_files:
        s3queue.put((file, cliargs))

    # only create/start the bar when it will actually be updated and
    # finished (fixes the bar being started but never finished in
    # quiet/debug/verbose mode)
    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar = diskover.progress_bar('Importing')
        bar.start()
        while s3queue.qsize() > 0:
            try:
                percent = int("{0:.0f}".format(
                    100 * ((len(inventory_files) - s3queue.qsize()) /
                           float(len(inventory_files)))))
                bar.update(percent)
            except (ZeroDivisionError, ValueError):
                # no files, or bar rejects the value — show 0 instead
                bar.update(0)
            time.sleep(.5)
        bar.finish()

    # wait for queue to be empty
    s3queue.join()
Example #4
0
def dupes_finder(es, q, cliargs, logger):
    """This is the duplicate file finder function.
    It searches Elasticsearch for files that have the same filehashes
    and adds file hash groups to Queue.

    Args:
        es: Elasticsearch client instance.
        q: rq Queue consumed by worker bots.
        cliargs: dict of parsed CLI args ('index', 'minsize', 'quiet',
            'debug', 'verbose', ...).
        logger: logger for status messages.
    """

    logger.info('Searching %s for duplicate file hashes...', cliargs['index'])

    # aggregation query: filehashes appearing on at least 2 files
    # (min_doc_count), largest files first; restricted to files with a
    # single hardlink inside the configured size range
    data = {
        "size": 0,
        "query": {
            "bool": {
                "must": {
                    "term": {
                        "hardlinks": 1
                    }
                },
                "filter": {
                    "range": {
                        "filesize": {
                            "lte": config['dupes_maxsize'],
                            "gte": cliargs['minsize']
                        }
                    }
                }
            }
        },
        "aggs": {
            "dupe_filehash": {
                "terms": {
                    "field": "filehash",
                    "min_doc_count": 2,
                    "size": 10000,
                    "order": {
                        "max_file_size": "desc"
                    }
                },
                "aggs": {
                    "max_file_size": {
                        "max": {
                            "field": "filesize"
                        }
                    }
                }
            }
        }
    }

    # refresh index
    es.indices.refresh(index=cliargs['index'])
    res = es.search(index=cliargs['index'],
                    doc_type='file',
                    body=data,
                    request_timeout=config['es_timeout'])

    logger.info('Found %s duplicate file hashes, enqueueing...',
                len(res['aggregations']['dupe_filehash']['buckets']))

    # add hash keys to Queue
    for bucket in res['aggregations']['dupe_filehash']['buckets']:
        q.enqueue(dupes_process_hashkey,
                  args=(
                      bucket['key'],
                      cliargs,
                  ),
                  result_ttl=config['redis_ttl'])

    logger.info('All file hashes have been enqueued')

    # only create the bar when output isn't suppressed; bar is the
    # single flag for all later progress handling
    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar = progress_bar('Checking')
        bar.start()
    else:
        bar = None

    # wait for queue to be empty and update progress bar
    time.sleep(1)
    while True:
        # NOTE(review): reads rq's private worker _state; assumes
        # "busy" marks an in-flight job — confirm against rq version
        workers_busy = any(
            worker._state == "busy"
            for worker in SimpleWorker.all(connection=redis_conn))
        q_len = len(q)
        if bar:
            try:
                bar.update(q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        if q_len == 0 and not workers_busy:
            break
        time.sleep(.5)

    if bar:
        bar.finish()
Example #5
0
def dupes_finder(es, q, cliargs, logger):
    """This is the duplicate file finder function.
    It searches Elasticsearch for files that have the same filehashes
    and adds file hash groups to Queue.

    Args:
        es: Elasticsearch client instance.
        q: rq Queue consumed by worker bots.
        cliargs: dict of parsed CLI args ('index', 'minsize',
            'adaptivebatch', 'batchsize', 'quiet', 'debug',
            'verbose', ...).
        logger: logger for status messages.
    """

    logger.info('Searching %s for all duplicate files...', cliargs['index'])

    if cliargs['adaptivebatch']:
        batchsize = ab_start
    else:
        batchsize = cliargs['batchsize']
    if cliargs['verbose'] or cliargs['debug']:
        logger.info('Batch size: %s' % batchsize)

    # first get all the filehashes with files that have a hardlinks count of 1
    data = {
        "size": 0,
        "query": {
            "bool": {
                "must": {
                    "term": {"hardlinks": 1}
                },
                "filter": {
                    "range": {
                        "filesize": {
                            "lte": config['dupes_maxsize'],
                            "gte": cliargs['minsize']
                        }
                    }
                }
            }
        }
    }

    # refresh index
    es.indices.refresh(index=cliargs['index'])
    # search es and start scroll
    res = es.search(index=cliargs['index'], scroll='1m', doc_type='file', size=config['es_scrollsize'],
                    body=data, request_timeout=config['es_timeout'])

    seen_hashes = set()  # global dedupe across all scroll pages and batches
    filehashlist = []    # current batch awaiting enqueue
    filehashcount = 0
    while res['hits']['hits'] and len(res['hits']['hits']) > 0:
        for hit in res['hits']['hits']:
            filehash = hit['_source']['filehash']
            # set membership is O(1) and, unlike checking the batch list
            # (which is cleared after every flush), still catches hashes
            # seen in earlier batches — fixes duplicate enqueues and an
            # inflated filehashcount across batches
            if filehash in seen_hashes:
                continue
            seen_hashes.add(filehash)
            filehashlist.append(filehash)
            filehashcount += 1
            if len(filehashlist) >= batchsize:
                # send to rq for bots to process file hashkey list
                q.enqueue(dupes_process_hashkey, args=(filehashlist, cliargs,), result_ttl=config['redis_ttl'])
                if cliargs['debug'] or cliargs['verbose']:
                    logger.info("enqueued batchsize: %s (batchsize: %s)" % (len(filehashlist), batchsize))
                del filehashlist[:]
                if cliargs['adaptivebatch']:
                    batchsize = adaptive_batch(q, cliargs, batchsize)
                    if cliargs['debug'] or cliargs['verbose']:
                        logger.info("batchsize set to: %s" % batchsize)

        # use es scroll api
        res = es.scroll(scroll_id=res['_scroll_id'], scroll='1m',
                        request_timeout=config['es_timeout'])

    # enqueue any remaining file hashes in a final partial batch
    if len(filehashlist) > 0:
        q.enqueue(dupes_process_hashkey, args=(filehashlist, cliargs,), result_ttl=config['redis_ttl'])

    logger.info('%s file hashes have been enqueued' % filehashcount)

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar = progress_bar('Checking')
        bar.start()
    else:
        bar = None

    # update progress bar until bots are idle and queue is empty
    while worker_bots_busy([q]):
        if bar:
            q_len = len(q)
            try:
                bar.update(q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        time.sleep(1)

    if bar:
        bar.finish()