Example #1
def qumulo_treewalk(path, ip, ses, q_crawl, num_sep, level, batchsize, cliargs, logger, reindex_dict):
    """This is the Qumulo tree walk function.
    It walks the directory tree over the Qumulo API using worker threads
    and enqueues batches of directory listings to the redis rq queue.
    """
    batch = []
    dircount = 0
    totaldirs = 0
    totalfiles = 0
    starttime = time.time()

    # queue for paths
    q_paths = PyQueue()
    q_paths_results = PyQueue()
    lock = Lock()

    # set up threads for tree walk
    for i in range(cliargs['walkthreads']):
        t = Thread(target=apiwalk_worker, args=(ip, ses, q_paths, q_paths_results, lock,))
        t.daemon = True
        t.start()

    # set up progress bar
    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        widgets = [progressbar.AnimatedMarker(), ' Crawling (Queue: ', progressbar.Counter(),
                   progressbar.FormatLabel(''), ') ', progressbar.Timer()]

        bar = progressbar.ProgressBar(widgets=widgets, max_value=progressbar.UnknownLength)
        bar.start()
    else:
        bar = None

    bartimestamp = time.time()
    for root, dirs, files in qumulo_api_walk(path, ip, ses, q_paths, q_paths_results):
        dircount += 1
        totaldirs += 1
        files_len = len(files)
        dirs_len = len(dirs)
        totalfiles += files_len
        if dirs_len == 0 and files_len == 0 and not cliargs['indexemptydirs']:
            continue
        if root['path'] != '/':
            root_path = root['path'].rstrip(os.path.sep)
        else:
            root_path = root['path']
        if not dir_excluded(root_path, config, cliargs):
            batch.append((root, dirs, files))
            batch_len = len(batch)
            if batch_len >= batchsize or (cliargs['adaptivebatch'] and totalfiles >= config['adaptivebatch_maxfiles']):
                q_crawl.enqueue(scrape_tree_meta, args=(batch, cliargs, reindex_dict,),
                                      result_ttl=config['redis_ttl'])
                if cliargs['debug'] or cliargs['verbose']:
                    logger.info("enqueued batchsize: %s (batchsize: %s)" % (batch_len, batchsize))
                del batch[:]
                if cliargs['adaptivebatch']:
                    batchsize = adaptive_batch(q_crawl, cliargs, batchsize)
                    if cliargs['debug'] or cliargs['verbose']:
                        logger.info("batchsize set to: %s" % batchsize)

            # if at the maxdepth level, clear the dirs/files lists so the
            # walk does not descend further down the tree
            if cliargs['maxdepth']:
                num_sep_this = root_path.count(os.path.sep)
                if num_sep + level <= num_sep_this:
                    del dirs[:]
                    del files[:]

        else:  # directory excluded
            del dirs[:]
            del files[:]

        # update progress bar
        if bar:
            try:
                if time.time() - bartimestamp >= 2:
                    elapsed = round(time.time() - bartimestamp, 3)
                    dirspersec = round(dircount / elapsed, 3)
                    widgets[4] = progressbar.FormatLabel(', ' + str(dirspersec) + ' dirs/sec) ')
                    bartimestamp = time.time()
                    dircount = 0
                bar.update(len(q_crawl))
            except (ZeroDivisionError, ValueError):
                bar.update(0)

    # add any remaining in batch to queue
    if len(batch) > 0:
        q_crawl.enqueue(scrape_tree_meta, args=(batch, cliargs, reindex_dict,), result_ttl=config['redis_ttl'])

    # set up progress bar with time remaining
    if bar:
        bar.finish()
        bar_max_val = len(q_crawl)
        bar = progressbar.ProgressBar(max_value=bar_max_val)
        bar.start()
    else:
        bar = None

    # update progress bar until bots are idle and queue is empty
    while worker_bots_busy([q_crawl]):
        if bar:
            q_len = len(q_crawl)
            try:
                bar.update(bar_max_val - q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        time.sleep(1)

    if bar:
        bar.finish()

    elapsed = round(time.time() - starttime, 3)
    dirspersec = round(totaldirs / elapsed, 3)

    logger.info("Finished crawling, elapsed time %s sec, dirs walked %s (%s dirs/sec)" %
                (elapsed, totaldirs, dirspersec))
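The pruning in the maxdepth branch relies on counting path separators. A minimal sketch of that arithmetic, assuming num_sep is the separator count of the starting path and level comes from cliargs['maxdepth'] (the values below are illustrative only):

import os

rootdir = '/data'                      # hypothetical starting path
num_sep = rootdir.count(os.path.sep)   # 1 separator in '/data'
level = 2                              # e.g. cliargs['maxdepth'] == 2

for candidate in ('/data/a', '/data/a/b', '/data/a/b/c'):
    num_sep_this = candidate.count(os.path.sep)
    # once num_sep + level <= num_sep_this the walk stops descending,
    # mirroring the "del dirs[:]; del files[:]" pruning above
    print(candidate, 'pruned' if num_sep + level <= num_sep_this else 'descend')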
Example #2
def socket_thread_handler_twc(threadnum, q, q_kill, lock, rootdir, num_sep,
                              level, batchsize, cliargs, logger, reindex_dict):
    """This is the socket thread handler tree walk client function.
    Stream of directory listings (pickle) from diskover treewalk
    client connections are enqueued to redis rq queue.
    """

    while True:
        try:
            c = q.get()
            clientsock, addr = c
            logger.debug(clientsock)
            logger.debug(addr)

            totalfiles = 0
            while True:
                data = recv_one_message(clientsock)
                if not data:
                    break
                if data == b'SIGKILL' or data == 'SIGKILL':
                    q_kill.put(b'SIGKILL')
                    break

                # unpickle data sent from client
                data_decoded = pickle.loads(data)
                logger.debug(data_decoded)

                # enqueue to redis
                batch = []
                for root, dirs, files in data_decoded:
                    files_len = len(files)
                    totalfiles += files_len
                    # check for empty dirs
                    if len(dirs) == 0 and len(
                            files) == 0 and not cliargs['indexemptydirs']:
                        continue
                    batch.append((root, dirs, files))
                    batch_len = len(batch)
                    if batch_len >= batchsize or (
                            cliargs['adaptivebatch'] and
                            totalfiles >= config['adaptivebatch_maxfiles']):
                        q_crawl.enqueue(scrape_tree_meta,
                                        args=(
                                            batch,
                                            cliargs,
                                            reindex_dict,
                                        ),
                                        result_ttl=config['redis_ttl'])
                        if cliargs['debug'] or cliargs['verbose']:
                            logger.info(
                                "enqueued batchsize: %s (batchsize: %s)" %
                                (batch_len, batchsize))
                        del batch[:]
                        totalfiles = 0
                        if cliargs['adaptivebatch']:
                            batchsize = adaptive_batch(q_crawl, cliargs,
                                                       batchsize)
                            if cliargs['debug'] or cliargs['verbose']:
                                logger.info("batchsize set to: %s" % batchsize)

                if len(batch) > 0:
                    # add any remaining in batch to queue
                    q_crawl.enqueue(scrape_tree_meta,
                                    args=(
                                        batch,
                                        cliargs,
                                        reindex_dict,
                                    ),
                                    result_ttl=config['redis_ttl'])
                    del batch[:]

            # close connection to client
            clientsock.close()
            logger.info("[thread-%s]: %s closed connection" %
                        (threadnum, str(addr)))
            q.task_done()

        except socket.error as e:
            logger.error("[thread-%s]: Socket error (%s)" % (threadnum, e))
Example #3
def socket_thread_handler_twc(threadnum, q, q_kill, rootdir, num_sep, level,
                              batchsize, cliargs, logger, reindex_dict):
    """This is the socket thread handler tree walk client function.
    Stream of directory listings (pickle) from diskover treewalk
    client connections are enqueued to redis rq queue.
    """

    while True:

        try:

            c = q.get()
            clientsock, addr = c
            logger.debug(clientsock)
            logger.debug(addr)

            while True:
                data = recv_one_message(clientsock)
                #logger.debug(data)

                if not data:
                    break

                if data == b'SIGKILL' or data == 'SIGKILL':
                    q_kill.put(b'SIGKILL')
                    break

                data_decoded = pickle.loads(data)
                logger.debug(data_decoded)

                # enqueue to redis
                batch = []
                for root, dirs, files in data_decoded:
                    if len(dirs) == 0 and len(
                            files) == 0 and not cliargs['indexemptydirs']:
                        continue
                    # check if meta stat data has been embedded in the data from the client
                    if type(root) is tuple:
                        rootpath = root[0]
                    else:
                        rootpath = root
                    if not dir_excluded(rootpath, config, cliargs['verbose']):
                        batch.append((root, dirs, files))
                        batch_len = len(batch)
                        if batch_len >= batchsize:
                            q_crawl.enqueue(scrape_tree_meta,
                                            args=(
                                                batch,
                                                cliargs,
                                                reindex_dict,
                                            ))
                            del batch[:]
                            if cliargs['adaptivebatch']:
                                batchsize = adaptive_batch(
                                    q_crawl, cliargs, batchsize)

                        # if at the maxdepth level, clear the dirs/files lists so the
                        # walk does not descend further down the tree
                        num_sep_this = rootpath.count(os.path.sep)
                        if num_sep + level <= num_sep_this:
                            del dirs[:]
                            del files[:]

                    else:  # directory excluded
                        del dirs[:]
                        del files[:]

                if len(batch) > 0:
                    # add any remaining in batch to queue
                    q_crawl.enqueue(scrape_tree_meta,
                                    args=(
                                        batch,
                                        cliargs,
                                        reindex_dict,
                                    ))
                    del batch[:]

            # close connection to client
            clientsock.close()
            logger.info("[thread-%s]: %s closed connection" %
                        (threadnum, str(addr)))
            q.task_done()

        except socket.error as e:
            logger.error("[thread-%s]: Socket error (%s)" % (threadnum, e))
Example #4
def dupes_finder(es, q, cliargs, logger):
    """This is the duplicate file finder function.
    It searches Elasticsearch for files that have the same filehashes
    and adds file hash groups to Queue.
    """

    logger.info('Searching %s for all dupe filehashes...', cliargs['index'])

    # build the search query: include hardlinked files when requested,
    # otherwise restrict to files with a hardlinks count of 1
    if cliargs['inchardlinks']:
        data = {
            "size": 0,
            "_source": ['filename', 'filehash', 'path_parent', 'last_modified', 'last_access'],
            "query": {
                "bool": {
                    "must": {
                        "range": {
                            "filesize": {
                                "lte": config['dupes_maxsize'],
                                "gte": cliargs['minsize']
                            }
                        }
                    }
                }
            }
        }
    else:
        data = {
            "size": 0,
            "_source": ['filename', 'filehash', 'path_parent', 'last_modified', 'last_access'],
            "query": {
                "bool": {
                    "must": {
                        "term": {"hardlinks": 1}
                    },
                    "filter": {
                        "range": {
                            "filesize": {
                                "lte": config['dupes_maxsize'],
                                "gte": cliargs['minsize']
                            }
                        }
                    }
                }
            }
        }

    # refresh index
    es.indices.refresh(index=cliargs['index'])
    # search es and start scroll
    res = es.search(index=cliargs['index'],
                    doc_type='file',
                    scroll='1m',
                    size=config['es_scrollsize'],
                    body=data,
                    request_timeout=config['es_timeout'])

    filehashes = {}
    while res['hits']['hits'] and len(res['hits']['hits']) > 0:
        for hit in res['hits']['hits']:
            filehash = hit['_source']['filehash']
            filepath = os.path.join(hit['_source']['path_parent'],
                                    hit['_source']['filename'])
            if filehash in filehashes:
                filehashes[filehash].append({
                    'id': hit['_id'],
                    'filename': filepath,
                    'atime': hit['_source']['last_access'],
                    'mtime': hit['_source']['last_modified'],
                    'md5': ''
                })
            else:
                filehashes[filehash] = [{
                    'id': hit['_id'],
                    'filename': filepath,
                    'atime': hit['_source']['last_access'],
                    'mtime': hit['_source']['last_modified'],
                    'md5': ''
                }]

        # use es scroll api
        res = es.scroll(scroll_id=res['_scroll_id'],
                        scroll='1m',
                        request_timeout=config['es_timeout'])

    possibledupescount = 0
    for key, value in list(filehashes.items()):
        filehash_filecount = len(value)
        if filehash_filecount < 2:
            del filehashes[key]
        else:
            possibledupescount += filehash_filecount

    logger.info('Found %s possible dupe files', possibledupescount)
    if possibledupescount == 0:
        return

    logger.info('Starting to enqueue dupe file hashes...')

    if cliargs['adaptivebatch']:
        batchsize = ab_start
    else:
        batchsize = cliargs['batchsize']
    if cliargs['verbose'] or cliargs['debug']:
        logger.info('Batch size: %s' % batchsize)

    n = 0
    hashgroups = []
    for key, value in filehashes.items():
        if cliargs['verbose'] or cliargs['debug']:
            logger.info('filehash: %s, filecount: %s' % (key, len(value)))
        hashgroups.append({'filehash': key, 'files': value})
        n += 1
        if n >= batchsize:
            # send to rq for bots to process hashgroups list
            q.enqueue(dupes_process_hashkeys,
                      args=(
                          hashgroups,
                          cliargs,
                      ),
                      result_ttl=config['redis_ttl'])
            if cliargs['debug'] or cliargs['verbose']:
                logger.info("enqueued batchsize: %s (batchsize: %s)" %
                            (n, batchsize))
            del hashgroups[:]
            n = 0
            if cliargs['adaptivebatch']:
                batchsize = adaptive_batch(q, cliargs, batchsize)
                if cliargs['debug'] or cliargs['verbose']:
                    logger.info("batchsize set to: %s" % batchsize)

    # enqueue any remaining hashgroups
    if n > 0:
        q.enqueue(dupes_process_hashkeys,
                  args=(
                      hashgroups,
                      cliargs,
                  ),
                  result_ttl=config['redis_ttl'])

    logger.info(
        '%s possible dupe file hashes have been enqueued, worker bots processing dupes...'
        % possibledupescount)

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar = progress_bar('Checking')
        bar.start()
    else:
        bar = None

    # update progress bar until bots are idle and queue is empty
    while worker_bots_busy([q]):
        if bar:
            q_len = len(q)
            try:
                bar.update(q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        time.sleep(1)

    if bar:
        bar.finish()
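To make the grouping step concrete, here is a worked illustration of how hits are bucketed by filehash and singleton groups are dropped before enqueueing; the hit data is fabricated purely to show the shape of the structures involved.

import os

# illustrative hits shaped like the Elasticsearch _source fields used above
hits = [
    {'_id': '1', '_source': {'filehash': 'abc', 'path_parent': '/data', 'filename': 'a.txt',
                             'last_access': '2019-01-01', 'last_modified': '2019-01-01'}},
    {'_id': '2', '_source': {'filehash': 'abc', 'path_parent': '/data', 'filename': 'b.txt',
                             'last_access': '2019-01-02', 'last_modified': '2019-01-02'}},
    {'_id': '3', '_source': {'filehash': 'def', 'path_parent': '/data', 'filename': 'c.txt',
                             'last_access': '2019-01-03', 'last_modified': '2019-01-03'}},
]

filehashes = {}
for hit in hits:
    src = hit['_source']
    filehashes.setdefault(src['filehash'], []).append({
        'id': hit['_id'],
        'filename': os.path.join(src['path_parent'], src['filename']),
        'atime': src['last_access'],
        'mtime': src['last_modified'],
        'md5': ''
    })

# drop groups with fewer than two files, as dupes_finder does before enqueueing
filehashes = {k: v for k, v in filehashes.items() if len(v) >= 2}
print(list(filehashes))  # only 'abc' survives; 'def' has a single file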
Example #5
def dupes_finder(es, q, cliargs, logger):
    """This is the duplicate file finder function.
    It searches Elasticsearch for files that have the same filehashes
    and adds file hash groups to Queue.
    """

    logger.info('Searching %s for all duplicate files...', cliargs['index'])

    if cliargs['adaptivebatch']:
        batchsize = ab_start
    else:
        batchsize = cliargs['batchsize']
    if cliargs['verbose'] or cliargs['debug']:
        logger.info('Batch size: %s' % batchsize)

    # first get all the filehashes with files that have a hardlinks count of 1
    data = {
        "size": 0,
        "query": {
            "bool": {
                "must": {
                    "term": {"hardlinks": 1}
                },
                "filter": {
                    "range": {
                        "filesize": {
                            "lte": config['dupes_maxsize'],
                            "gte": cliargs['minsize']
                        }
                    }
                }
            }
        }
    }

    # refresh index
    es.indices.refresh(index=cliargs['index'])
    # search es and start scroll
    res = es.search(index=cliargs['index'], scroll='1m', doc_type='file', size=config['es_scrollsize'],
                    body=data, request_timeout=config['es_timeout'])

    filehashlist = []
    filehashcount = 0
    while res['hits']['hits'] and len(res['hits']['hits']) > 0:
        for hit in res['hits']['hits']:
            filehash = hit['_source']['filehash']
            if filehash not in filehashlist:
                filehashlist.append(filehash)
                filehashcount += 1
                filehashlist_len = len(filehashlist)
                if filehashlist_len >= batchsize:
                    # send to rq for bots to process file hashkey list
                    q.enqueue(dupes_process_hashkey, args=(filehashlist, cliargs,), result_ttl=config['redis_ttl'])
                    if cliargs['debug'] or cliargs['verbose']:
                        logger.info("enqueued batchsize: %s (batchsize: %s)" % (filehashlist_len, batchsize))
                    del filehashlist[:]
                    if cliargs['adaptivebatch']:
                        batchsize = adaptive_batch(q, cliargs, batchsize)
                        if cliargs['debug'] or cliargs['verbose']:
                            logger.info("batchsize set to: %s" % batchsize)

        # use es scroll api
        res = es.scroll(scroll_id=res['_scroll_id'], scroll='1m',
                        request_timeout=config['es_timeout'])

    # enqueue any remaining file hashes
    if len(filehashlist) > 0:
        q.enqueue(dupes_process_hashkey, args=(filehashlist, cliargs,), result_ttl=config['redis_ttl'])

    logger.info('%s file hashes have been enqueued' % filehashcount)

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar = progress_bar('Checking')
        bar.start()
    else:
        bar = None

    # update progress bar until bots are idle and queue is empty
    while worker_bots_busy([q]):
        if bar:
            q_len = len(q)
            try:
                bar.update(q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        time.sleep(1)

    if bar:
        bar.finish()
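Both dupes_finder variants page through matches with the Elasticsearch scroll API. A minimal, self-contained sketch of that pattern, using the same older elasticsearch-py client style as the examples (the index name and query below are placeholders):

from elasticsearch import Elasticsearch

def scroll_all_hits(es, index, query, page_size=1000, timeout=60):
    # yield every hit for a query by paging with the scroll API
    res = es.search(index=index, scroll='1m', size=page_size,
                    body=query, request_timeout=timeout)
    while res['hits']['hits']:
        for hit in res['hits']['hits']:
            yield hit
        res = es.scroll(scroll_id=res['_scroll_id'], scroll='1m',
                        request_timeout=timeout)

# placeholder usage
es = Elasticsearch()
for hit in scroll_all_hits(es, 'diskover-index', {"query": {"match_all": {}}}):
    pass  # process hit['_source'] here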