Exemple #1
0
def qumulo_treewalk(path, lock, ip, ses, num_sep, level, totaljobs, batchsize,
                    cliargs, logger, reindex_dict):
    """Walk a Qumulo tree and enqueue directory batches for the worker bots.

    Every directory yielded by ``qumulo_api_walk`` that is neither excluded
    nor skipped as empty is collected as a ``(root, files)`` tuple. Each time
    ``batchsize`` tuples have accumulated, the batch is enqueued on
    ``diskover.q`` as a ``diskover_worker_bot.scrape_tree_meta`` job and
    ``totaljobs.value`` is incremented while holding ``lock``. Whatever is
    left in the batch when the walk ends is always enqueued (even when it is
    empty) and counted as one more job.

    Args:
        path: start path, forwarded to ``qumulo_api_walk``.
        lock: context manager entered around every ``totaljobs`` update.
        ip: forwarded unchanged to ``qumulo_api_walk``.
        ses: forwarded unchanged to ``qumulo_api_walk``.
        num_sep: baseline separator count; together with ``level`` it decides
            when the walk is pruned.
        level: maximum depth; a directory is pruned when
            ``num_sep + level`` is <= the number of ``os.path.sep`` in its
            path.
        totaljobs: object with a numeric ``value`` attribute, incremented
            once per enqueued job.
        batchsize: number of directories per job (adjusted in steps of 10
            when ``cliargs['adaptivebatch']`` is set).
        cliargs: dict of options; reads ``verbose``, ``debug``,
            ``indexemptydirs`` and ``adaptivebatch``; writes ``batchsize``.
        logger: logger used to report batch size changes.
        reindex_dict: forwarded unchanged to every enqueued job.
    """
    batch = []

    for root, dirs, files in qumulo_api_walk(path, ip, ses):
        if root['path'] != '/':
            root_path = root['path'].rstrip(os.path.sep)
        else:
            root_path = root['path']

        if diskover.dir_excluded(root_path, diskover.config,
                                 cliargs['verbose']):
            # directory excluded
            del dirs[:]
            del files[:]
            continue

        if len(dirs) == 0 and len(files) == 0 \
                and not cliargs['indexemptydirs']:
            continue

        batch.append((root, files))
        if len(batch) >= batchsize:
            diskover.q.enqueue(diskover_worker_bot.scrape_tree_meta,
                               args=(
                                   batch,
                                   cliargs,
                                   reindex_dict,
                               ))
            with lock:
                totaljobs.value += 1
            del batch[:]
            if cliargs['adaptivebatch']:
                batchsize_prev = batchsize
                # Sample the queue length exactly once: workers may drain the
                # queue between two separate len() calls, in which case
                # neither the shrink nor the grow branch would run.
                q_len = len(diskover.q)
                if q_len == 0:
                    if (batchsize - 10) >= diskover.adaptivebatch_startsize:
                        batchsize = batchsize - 10
                elif (batchsize + 10) <= diskover.adaptivebatch_maxsize:
                    batchsize = batchsize + 10
                cliargs['batchsize'] = batchsize
                if (cliargs['verbose'] or cliargs['debug']) \
                        and batchsize_prev != batchsize:
                    logger.info('Batch size: %s' % batchsize)

        # check if at maxdepth level and delete dirs/files lists to not
        # descend further down the tree
        # NOTE(review): `files` is the same list object that was just stored
        # in `batch`, so clearing it here also empties a not-yet-enqueued
        # entry -- confirm this is intended.
        num_sep_this = root_path.count(os.path.sep)
        if num_sep + level <= num_sep_this:
            del dirs[:]
            del files[:]

    # add any remaining in batch to queue
    diskover.q.enqueue(diskover_worker_bot.scrape_tree_meta,
                       args=(
                           batch,
                           cliargs,
                           reindex_dict,
                       ))
    with lock:
        totaljobs.value += 1
def qumulo_treewalk(path, ip, ses, num_sep, level, batchsize, bar, cliargs, reindex_dict):
    """Walk a Qumulo tree, enqueue directory batches and update a progress bar.

    Directories yielded by ``qumulo_api_walk`` that are neither excluded nor
    skipped as empty are collected as ``(root, files)`` tuples. Each full
    batch is enqueued on ``diskover.q`` as a
    ``diskover_worker_bot.scrape_tree_meta`` job and ``diskover.totaljobs``
    is incremented. The remaining batch is always enqueued after the walk
    (even when empty) and counted as one more job.

    Args:
        path: start path, forwarded to ``qumulo_api_walk``.
        ip: forwarded unchanged to ``qumulo_api_walk``.
        ses: forwarded unchanged to ``qumulo_api_walk``.
        num_sep: baseline separator count used for depth pruning.
        level: maximum depth; a directory is pruned when
            ``num_sep + level`` is <= the number of ``os.path.sep`` in its
            path.
        batchsize: number of directories per job (adjusted in steps of 10
            when ``cliargs['adaptivebatch']`` is set).
        bar: progress bar; ``bar.update(percent)`` is called unless one of
            ``quiet``/``debug``/``verbose`` is set.
        cliargs: dict of options; reads ``verbose``, ``debug``, ``quiet``,
            ``indexemptydirs`` and ``adaptivebatch``; writes ``batchsize``.
        reindex_dict: forwarded unchanged to every enqueued job.
    """
    batch = []

    for root, dirs, files in qumulo_api_walk(path, ip, ses):
        if root['path'] != '/':
            root_path = root['path'].rstrip(os.path.sep)
        else:
            root_path = root['path']
        if not diskover.dir_excluded(root_path, diskover.config, cliargs['verbose']):
            if len(dirs) == 0 and len(files) == 0 and not cliargs['indexemptydirs']:
                # skipped directories do not trigger a progress update either
                continue

            batch.append((root, files))
            if len(batch) >= batchsize:
                diskover.q.enqueue(diskover_worker_bot.scrape_tree_meta,
                                   args=(batch, cliargs, reindex_dict,))
                diskover.totaljobs += 1
                del batch[:]
                if cliargs['adaptivebatch']:
                    batchsize_prev = batchsize
                    # Sample the queue length exactly once: workers may drain
                    # the queue between two separate len() calls, in which
                    # case neither the shrink nor the grow branch would run.
                    q_len = len(diskover.q)
                    if q_len == 0:
                        if (batchsize - 10) >= diskover.adaptivebatch_startsize:
                            batchsize = batchsize - 10
                    elif (batchsize + 10) <= diskover.adaptivebatch_maxsize:
                        batchsize = batchsize + 10
                    cliargs['batchsize'] = batchsize
                    if (cliargs['verbose'] or cliargs['debug']) \
                            and batchsize_prev != batchsize:
                        diskover.logger.info('Batch size: %s' % batchsize)

            # check if at maxdepth level and delete dirs/files lists to not
            # descend further down the tree
            # NOTE(review): `files` is the same list object that was just
            # stored in `batch`, so clearing it here also empties a
            # not-yet-enqueued entry -- confirm this is intended.
            num_sep_this = root_path.count(os.path.sep)
            if num_sep + level <= num_sep_this:
                del dirs[:]
                del files[:]

        else:  # directory excluded
            del dirs[:]
            del files[:]

        if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
            try:
                # share of jobs no longer waiting in the queue, as a whole percent
                percent = int("{0:.0f}".format(100 * ((diskover.totaljobs - len(diskover.q))
                                                      / float(diskover.totaljobs))))
                bar.update(percent)
            except (ZeroDivisionError, ValueError):
                # no jobs enqueued yet, or the bar rejected the value
                bar.update(0)

    # add any remaining in batch to queue
    diskover.q.enqueue(diskover_worker_bot.scrape_tree_meta,
                       args=(batch, cliargs, reindex_dict,))
    diskover.totaljobs += 1
Exemple #3
0
def qumulo_treewalk(path, ip, ses, q_crawl, num_sep, level, batchsize, cliargs, reindex_dict, bar):
    """Walk a Qumulo tree and feed directory batches to the crawl queue.

    Each directory yielded by ``qumulo_api_walk`` that is non-empty (or
    wanted via ``cliargs['indexemptydirs']``) and not excluded is stored as a
    ``(root, files)`` tuple. Once ``batchsize`` tuples are pending they are
    enqueued on ``q_crawl`` as a ``diskover_worker_bot.scrape_tree_meta``
    job. With ``cliargs['adaptivebatch']`` the batch size is moved by
    ``diskover.ab_step`` within ``diskover.ab_start``..``diskover.ab_max``
    depending on whether the queue is empty. Whatever is pending after the
    walk is enqueued unconditionally.

    ``bar.update(len(q_crawl))`` is called per processed directory unless one
    of ``quiet``/``debug``/``verbose`` is set in ``cliargs``.
    """
    pending = []

    for root, dirs, files in qumulo_api_walk(path, ip, ses):
        # nothing to index for an empty directory unless explicitly requested
        if len(dirs) == 0 and len(files) == 0 and not cliargs['indexemptydirs']:
            continue

        raw_path = root['path']
        root_path = raw_path if raw_path == '/' else raw_path.rstrip(os.path.sep)

        if diskover.dir_excluded(root_path, diskover.config, cliargs['verbose']):
            # excluded directory: empty both lists in place
            del dirs[:]
            del files[:]
        else:
            pending.append((root, files))
            if len(pending) >= batchsize:
                q_crawl.enqueue(diskover_worker_bot.scrape_tree_meta,
                                args=(pending, cliargs, reindex_dict,))
                del pending[:]
                if cliargs['adaptivebatch']:
                    previous = batchsize
                    if len(q_crawl) == 0:
                        # queue is empty: try a smaller batch
                        candidate = batchsize - diskover.ab_step
                        if candidate >= diskover.ab_start:
                            batchsize = candidate
                    else:
                        # jobs are waiting: try a larger batch
                        candidate = batchsize + diskover.ab_step
                        if candidate <= diskover.ab_max:
                            batchsize = candidate
                    cliargs['batchsize'] = batchsize
                    if (cliargs['verbose'] or cliargs['debug']) and previous != batchsize:
                        diskover.logger.info('Batch size: %s' % batchsize)

            # at or below maxdepth: empty the lists so the walk does not
            # descend any further
            # NOTE(review): `files` is the same list object stored in
            # `pending` above, so this also empties a not-yet-enqueued entry
            # -- confirm this is intended.
            if root_path.count(os.path.sep) >= num_sep + level:
                del dirs[:]
                del files[:]

        # update progress bar
        if not (cliargs['quiet'] or cliargs['debug'] or cliargs['verbose']):
            try:
                bar.update(len(q_crawl))
            except (ZeroDivisionError, ValueError):
                bar.update(0)

    # add any remaining in batch to queue
    q_crawl.enqueue(diskover_worker_bot.scrape_tree_meta, args=(pending, cliargs, reindex_dict,))
Exemple #4
0
def qumulo_api_walk(path, q_paths, q_paths_results, cliargs):
    """Generate ``(root, dirs, nondirs)`` tuples read from a results queue.

    ``path`` is put on ``q_paths`` first. Each entry taken from
    ``q_paths_results`` is yielded before its sub-directories are examined,
    so a consumer that empties ``dirs`` in place prevents them from being
    queued. Every non-excluded sub-directory path is put on ``q_paths``.

    The generator ends once both queues report a size of zero on two checks
    taken half a second apart.
    """
    def _both_queues_empty():
        return q_paths_results.qsize() == 0 and q_paths.qsize() == 0

    q_paths.put(path)
    while True:
        root, dirs, nondirs = q_paths_results.get()
        # hand the entry to the consumer before looking at its children
        yield root, dirs, nondirs
        # queue the sub-directories that are not excluded
        for subdir in dirs:
            child_path = os.path.join(root, subdir)
            if dir_excluded(child_path, config, cliargs):
                continue
            q_paths.put(child_path)
        q_paths_results.task_done()
        if not _both_queues_empty():
            continue
        # look again after a short pause before deciding the walk is over
        time.sleep(.5)
        if _both_queues_empty():
            break
Exemple #5
0
def qumulo_treewalk(path, ip, ses, q_crawl, num_sep, level, batchsize, cliargs, logger, reindex_dict):
    """Walk a Qumulo tree with worker threads and enqueue directory batches.

    Starts ``cliargs['walkthreads']`` daemon threads running
    ``apiwalk_worker`` and consumes ``qumulo_api_walk``. Every directory that
    is non-empty (or wanted via ``cliargs['indexemptydirs']``) and not
    excluded is stored as a ``(root, dirs, files)`` tuple. A batch is
    enqueued on ``q_crawl`` as a ``scrape_tree_meta`` job when it holds
    ``batchsize`` directories or, with ``cliargs['adaptivebatch']``, when the
    files seen since the previous enqueue reach
    ``config['adaptivebatch_maxfiles']``. After the walk the remaining batch
    is enqueued unconditionally, then the function polls
    ``worker_bots_busy([q_crawl])`` once per second until it returns false
    and logs a summary.

    Args:
        path: start path, forwarded to ``qumulo_api_walk``.
        ip: forwarded to the worker threads and ``qumulo_api_walk``.
        ses: forwarded to the worker threads and ``qumulo_api_walk``.
        q_crawl: queue the ``scrape_tree_meta`` jobs are enqueued on.
        num_sep: baseline separator count used for depth pruning.
        level: maximum depth, only applied when ``cliargs['maxdepth']`` is
            truthy.
        batchsize: initial number of directories per job.
        cliargs: dict of options; reads ``walkthreads``, ``quiet``,
            ``debug``, ``verbose``, ``indexemptydirs``, ``adaptivebatch``
            and ``maxdepth``.
        logger: logger used for debug/verbose output and the final summary.
        reindex_dict: forwarded unchanged to every enqueued job.
    """
    batch = []
    dircount = 0      # directories seen since the last dirs/sec refresh
    totaldirs = 0     # directories seen during the whole walk
    totalfiles = 0    # files seen since the last enqueue (see reset below)
    starttime = time.time()

    # queue for paths
    q_paths = PyQueue()
    q_paths_results = PyQueue()
    lock = Lock()

    # set up threads for tree walk
    for _ in range(cliargs['walkthreads']):
        t = Thread(target=apiwalk_worker, args=(ip, ses, q_paths, q_paths_results, lock,))
        t.daemon = True
        t.start()

    # set up progress bar
    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        widgets = [progressbar.AnimatedMarker(), ' Crawling (Queue: ', progressbar.Counter(),
                   progressbar.FormatLabel(''), ') ', progressbar.Timer()]

        bar = progressbar.ProgressBar(widgets=widgets, max_value=progressbar.UnknownLength)
        bar.start()
    else:
        bar = None

    bartimestamp = time.time()
    for root, dirs, files in qumulo_api_walk(path, ip, ses, q_paths, q_paths_results):
        dircount += 1
        totaldirs += 1
        files_len = len(files)
        dirs_len = len(dirs)
        totalfiles += files_len
        if dirs_len == 0 and files_len == 0 and not cliargs['indexemptydirs']:
            continue
        if root['path'] != '/':
            root_path = root['path'].rstrip(os.path.sep)
        else:
            root_path = root['path']
        if not dir_excluded(root_path, config, cliargs):
            batch.append((root, dirs, files))
            batch_len = len(batch)
            if batch_len >= batchsize or (cliargs['adaptivebatch'] and totalfiles >= config['adaptivebatch_maxfiles']):
                q_crawl.enqueue(scrape_tree_meta, args=(batch, cliargs, reindex_dict,),
                                result_ttl=config['redis_ttl'])
                if cliargs['debug'] or cliargs['verbose']:
                    logger.info("enqueued batchsize: %s (batchsize: %s)" % (batch_len, batchsize))
                del batch[:]
                # Start counting files afresh for the next batch. Without
                # this reset the max-files condition stays true forever once
                # reached and every later directory becomes its own job.
                totalfiles = 0
                if cliargs['adaptivebatch']:
                    batchsize = adaptive_batch(q_crawl, cliargs, batchsize)
                    if cliargs['debug'] or cliargs['verbose']:
                        logger.info("batchsize set to: %s" % batchsize)

            # check if at maxdepth level and delete dirs/files lists to not
            # descend further down the tree
            # NOTE(review): `dirs`/`files` are the same list objects stored
            # in `batch` above, so clearing them here also empties a
            # not-yet-enqueued entry -- confirm this is intended.
            if cliargs['maxdepth']:
                num_sep_this = root_path.count(os.path.sep)
                if num_sep + level <= num_sep_this:
                    del dirs[:]
                    del files[:]

        else:  # directory excluded
            del dirs[:]
            del files[:]

        # update progress bar
        if bar:
            try:
                # refresh the dirs/sec label at most every two seconds
                if time.time() - bartimestamp >= 2:
                    elapsed = round(time.time() - bartimestamp, 3)
                    dirspersec = round(dircount / elapsed, 3)
                    widgets[4] = progressbar.FormatLabel(', ' + str(dirspersec) + ' dirs/sec) ')
                    bartimestamp = time.time()
                    dircount = 0
                bar.update(len(q_crawl))
            except (ZeroDivisionError, ValueError):
                bar.update(0)

    # add any remaining in batch to queue
    q_crawl.enqueue(scrape_tree_meta, args=(batch, cliargs, reindex_dict,), result_ttl=config['redis_ttl'])

    # set up progress bar with time remaining
    if bar:
        bar.finish()
        bar_max_val = len(q_crawl)
        bar = progressbar.ProgressBar(max_value=bar_max_val)
        bar.start()

    # update progress bar until bots are idle and queue is empty
    while worker_bots_busy([q_crawl]):
        if bar:
            q_len = len(q_crawl)
            try:
                bar.update(bar_max_val - q_len)
            except (ZeroDivisionError, ValueError):
                bar.update(0)
        time.sleep(1)

    if bar:
        bar.finish()

    elapsed = round(time.time() - starttime, 3)
    # elapsed can round down to 0 on a very short crawl; avoid dividing by it
    dirspersec = round(totaldirs / elapsed, 3) if elapsed else 0.0

    logger.info("Finished crawling, elapsed time %s sec, dirs walked %s (%s dirs/sec)" %
                (elapsed, totaldirs, dirspersec))
def socket_thread_handler_twc(threadnum, q, q_kill, rootdir, num_sep, level,
                              batchsize, cliargs, logger, reindex_dict):
    """This is the socket thread handler tree walk client function.
    Stream of directory listings (pickle) from diskover treewalk
    client connections are enqueued to redis rq queue.

    Runs forever: takes a ``(clientsock, addr)`` pair from ``q``, reads
    messages with ``recv_one_message`` until an empty message or a
    ``SIGKILL`` message arrives (the latter is forwarded to ``q_kill``),
    unpickles each message into ``(root, dirs, files)`` entries and enqueues
    them on ``q_crawl`` in batches of ``batchsize`` as ``scrape_tree_meta``
    jobs.

    The client socket is always closed and ``q.task_done()`` is always
    called for a connection, also when a socket error or another exception
    interrupts it. Socket errors are logged and the thread moves on to the
    next connection; other exceptions propagate.

    ``rootdir`` is accepted but not used in this function.
    """

    while True:
        clientsock, addr = q.get()

        try:
            try:
                logger.debug(clientsock)
                logger.debug(addr)

                while True:
                    data = recv_one_message(clientsock)

                    if not data:
                        break

                    if data == b'SIGKILL' or data == 'SIGKILL':
                        q_kill.put(b'SIGKILL')
                        break

                    # SECURITY NOTE(review): pickle.loads on data received
                    # from the network can execute arbitrary code; only safe
                    # if every connecting client is trusted.
                    data_decoded = pickle.loads(data)
                    logger.debug(data_decoded)

                    # enqueue to redis
                    batch = []
                    for root, dirs, files in data_decoded:
                        if len(dirs) == 0 and len(
                                files) == 0 and not cliargs['indexemptydirs']:
                            continue
                        # check if meta stat data has been embedded in the
                        # data from client
                        if isinstance(root, tuple):
                            rootpath = root[0]
                        else:
                            rootpath = root
                        if not dir_excluded(rootpath, config, cliargs['verbose']):
                            batch.append((root, dirs, files))
                            batch_len = len(batch)
                            if batch_len >= batchsize:
                                q_crawl.enqueue(scrape_tree_meta,
                                                args=(
                                                    batch,
                                                    cliargs,
                                                    reindex_dict,
                                                ))
                                del batch[:]
                                if cliargs['adaptivebatch']:
                                    batchsize = adaptive_batch(
                                        q_crawl, cliargs, batchsize)

                            # check if at maxdepth level and delete dirs/files
                            # lists to not descend further down the tree
                            # NOTE(review): `dirs`/`files` are the same list
                            # objects stored in `batch` above, so this also
                            # empties a not-yet-enqueued entry -- confirm
                            # this is intended.
                            num_sep_this = rootpath.count(os.path.sep)
                            if num_sep + level <= num_sep_this:
                                del dirs[:]
                                del files[:]

                        else:  # directory excluded
                            del dirs[:]
                            del files[:]

                    if batch:
                        # add any remaining in batch to queue
                        q_crawl.enqueue(scrape_tree_meta,
                                        args=(
                                            batch,
                                            cliargs,
                                            reindex_dict,
                                        ))
                        del batch[:]
            finally:
                # close connection to client, also after an error
                clientsock.close()

            logger.info("[thread-%s]: %s closed connection" %
                        (threadnum, str(addr)))

        except socket.error as e:
            logger.error("[thread-%s]: Socket error (%s)" % (threadnum, e))

        finally:
            # keep get()/task_done() balanced so a q.join() cannot hang
            q.task_done()