def scrape_tree_meta(paths, cliargs, reindex_dict):
    try:
        global worker
        tree_dirs = []
        tree_files = []
        totalcrawltime = 0
        statsembeded = False
        num_workers = len(SimpleWorker.all(connection=redis_conn))

        path_count = 0
        filenames = []
        for path in paths:
            path_count += 1
            starttime = time.time()
            if not cliargs['dirsonly']:
                root, dirs, files = path
            else:
                root, dirs = path
                files = []
            if path_count == 1:
                if type(root) is tuple:
                    statsembeded = True
            # check if stats are embedded in data from diskover tree walk client or crawlapi
            if statsembeded:
                root_path = root[0]
                dmeta = get_dir_meta(worker,
                                     root,
                                     cliargs,
                                     reindex_dict,
                                     statsembeded=True)
            else:
                root_path = root
                dmeta = get_dir_meta(worker,
                                     root_path,
                                     cliargs,
                                     reindex_dict,
                                     statsembeded=False)

            if dmeta:
                # no files in batch, get them with scandir
                if cliargs['dirsonly']:
                    for entry in scandir(root_path):
                        if entry.is_file(
                                follow_symlinks=False) and not file_excluded(
                                    entry.name):
                            files.append(entry.name)
                filecount = 0
                # check if the directory has a large number of files and farm out meta collection to other worker bots
                files_count = len(files)
                if cliargs['splitfiles'] and files_count >= cliargs[
                        'splitfilesnum']:
                    fmetas = []
                    for filelist in split_list(files,
                                               int(files_count / num_workers)):
                        fmetas.append(
                            q_crawl.enqueue(file_meta_collector,
                                            args=(
                                                filelist,
                                                root_path,
                                                statsembeded,
                                                cliargs,
                                                reindex_dict,
                                            ),
                                            result_ttl=config['redis_ttl']))
                    n = 0
                    while n < len(fmetas):
                        if fmetas[n].result:
                            for fmeta in fmetas[n].result:
                                if fmeta:
                                    tree_files.append(fmeta)
                                    filecount += 1
                            n += 1
                        else:
                            # result not ready yet, wait briefly before polling again
                            time.sleep(.05)
                    del fmetas[:]
                else:
                    for file in files:
                        # file is a (name, stats) tuple when stats are embedded, otherwise a filename string
                        filenames.append(file[0] if statsembeded else file)
                        if statsembeded:
                            fmeta = get_file_meta(worker,
                                                  file,
                                                  cliargs,
                                                  reindex_dict,
                                                  statsembeded=True)
                        else:
                            fmeta = get_file_meta(worker,
                                                  os.path.join(
                                                      root_path, file),
                                                  cliargs,
                                                  reindex_dict,
                                                  statsembeded=False)
                        if fmeta:
                            tree_files.append(fmeta)
                            filecount += 1

                # update crawl time
                elapsed = time.time() - starttime
                dmeta['crawl_time'] = round(elapsed, 6)
                # check for empty dirs and the indexemptydirs cli arg
                if cliargs['indexemptydirs']:
                    tree_dirs.append(dmeta)
                elif not cliargs['indexemptydirs'] and (len(dirs) > 0
                                                        or filecount > 0):
                    tree_dirs.append(dmeta)
                totalcrawltime += elapsed

            # check if doc count is more than es chunksize and bulk add to es
            if len(tree_dirs) + len(tree_files) >= config['es_chunksize']:
                es_bulk_add(worker, tree_dirs, tree_files, cliargs,
                            totalcrawltime)
                del tree_dirs[:]
                del tree_files[:]
                totalcrawltime = 0

        # bulk add to es
        if len(tree_dirs) > 0 or len(tree_files) > 0:
            es_bulk_add(worker, tree_dirs, tree_files, cliargs, totalcrawltime)

        print('%s | processed %d files' % (datetime.now(), len(filenames)))
        return True, filenames
    except Exception as e:
        print('%s | error | %s' % (datetime.now(), e))
        return False, []
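
# split_list is a diskover helper used above to farm file batches out to other
# worker bots; a minimal sketch of what it might look like (an assumption, not
# the project's actual implementation): yield successive chunks of at most n
# items from lst.
def split_list(lst, n):
    # guard against a chunk size of 0, e.g. when files_count < num_workers
    n = max(int(n), 1)
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

# example: split_list(['a', 'b', 'c', 'd', 'e'], 2) -> ['a', 'b'], ['c', 'd'], ['e']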
Example 2
def dupes_finder(es, q, cliargs, logger):
    """This is the duplicate file finder function.
    It searches Elasticsearch for files that share the same filehash
    and adds each file hash group to the worker queue.
    """

    logger.info('Searching %s for duplicate file hashes...', cliargs['index'])

    # find the filehashes with largest files and add filehash keys
    # to hashgroups
    data = {
        "size": 0,
        "query": {
            "bool": {
                "must": {
                    "term": {
                        "hardlinks": 1
                    }
                },
                "filter": {
                    "range": {
                        "filesize": {
                            "lte": config['dupes_maxsize'],
                            "gte": cliargs['minsize']
                        }
                    }
                }
            }
        },
        "aggs": {
            "dupe_filehash": {
                "terms": {
                    "field": "filehash",
                    "min_doc_count": 2,
                    "size": 10000,
                    "order": {
                        "max_file_size": "desc"
                    }
                },
                "aggs": {
                    "max_file_size": {
                        "max": {
                            "field": "filesize"
                        }
                    }
                }
            }
        }
    }

    # refresh index
    es.indices.refresh(index=cliargs['index'])
    res = es.search(index=cliargs['index'],
                    doc_type='file',
                    body=data,
                    request_timeout=config['es_timeout'])

    logger.info('Found %s duplicate file hashes, enqueueing...',
                len(res['aggregations']['dupe_filehash']['buckets']))

    # add hash keys to Queue
    for bucket in res['aggregations']['dupe_filehash']['buckets']:
        q.enqueue(dupes_process_hashkey,
                  args=(
                      bucket['key'],
                      cliargs,
                  ),
                  result_ttl=config['redis_ttl'])

    logger.info('All file hashes have been enqueued')

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar = progress_bar('Checking')
        bar.start()
    else:
        bar = None

    # wait for queue to be empty and update progress bar
    time.sleep(1)
    while True:
        workers_busy = False
        workers = SimpleWorker.all(connection=redis_conn)
        for worker in workers:
            if worker._state == "busy":
                workers_busy = True
                break
        q_len = len(q)
        if not cliargs['quiet'] and not cliargs['debug'] and not cliargs[
                'verbose']:
            try:
                bar.update(q_len)
            except ZeroDivisionError:
                bar.update(0)
            except ValueError:
                bar.update(0)
        if q_len == 0 and not workers_busy:
            break
        time.sleep(.5)

    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        bar.finish()
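
# For reference, the terms aggregation used above returns buckets shaped
# roughly like the hand-written illustration below (not captured output);
# dupes_finder only reads bucket['key'], the filehash shared by the group.
example_res = {
    'aggregations': {
        'dupe_filehash': {
            'buckets': [
                {'key': 'd41d8cd98f00b204e9800998ecf8427e',
                 'doc_count': 3,
                 'max_file_size': {'value': 1048576.0}},
            ]
        }
    }
}
for bucket in example_res['aggregations']['dupe_filehash']['buckets']:
    print(bucket['key'], bucket['doc_count'])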
Example 3
def get_report_metrics(self):
    conn = self.connection
    worker_count = len(SimpleWorker.all(connection=conn))
    fq = get_failed_queue(connection=conn)
    return {"queue.failed": len(fq), "queue.workers": worker_count}
Example 4
def scrape_tree_meta(paths, cliargs, reindex_dict):
    global worker
    tree_dirs = []
    tree_files = []
    totalcrawltime = 0
    num_workers = len(SimpleWorker.all(connection=redis_conn))

    for path in paths:
        starttime = time.time()
        root, dirs, files = path

        # check if dirchunk or stats are embedded in data from
        # the diskover tree walk client or crawlapi
        if type(root) is tuple:
            if root[1] == 'dchunk':
                dirchunk = True
                statsembeded = False
            else:
                statsembeded = True
                dirchunk = False
        else:
            statsembeded = False
            dirchunk = False

        if statsembeded:
            root_path = root[0]
            dmeta = get_dir_meta(worker,
                                 root,
                                 cliargs,
                                 reindex_dict,
                                 statsembeded=True)
        else:
            if dirchunk:
                root_path = root[0]
                dmeta = {'chunkpath': root_path}
            else:
                root_path = root
                dmeta = get_dir_meta(worker,
                                     root_path,
                                     cliargs,
                                     reindex_dict,
                                     statsembeded=False)

        if dmeta:
            filecount = 0
            # check if the directory has a large number of files and farm out meta collection to other worker bots
            files_count = len(files)
            if cliargs[
                    'splitfiles'] and files_count >= cliargs['splitfilesnum']:
                fmetas = []
                for filelist in split_list(files,
                                           int(files_count / num_workers)):
                    fmetas.append(
                        q_crawl.enqueue(file_meta_collector,
                                        args=(
                                            filelist,
                                            root_path,
                                            statsembeded,
                                            cliargs,
                                            reindex_dict,
                                        ),
                                        result_ttl=config['redis_ttl']))
                n = 0
                while n < len(fmetas):
                    if fmetas[n].result:
                        for fmeta in fmetas[n].result:
                            if fmeta:
                                tree_files.append(fmeta)
                                filecount += 1
                        n += 1
                    else:
                        time.sleep(.05)
                del fmetas[:]
            else:
                for file in files:
                    if statsembeded:
                        fmeta = get_file_meta(worker,
                                              file,
                                              cliargs,
                                              reindex_dict,
                                              statsembeded=True)
                    else:
                        fmeta = get_file_meta(worker,
                                              os.path.join(root_path, file),
                                              cliargs,
                                              reindex_dict,
                                              statsembeded=False)
                    if fmeta:
                        tree_files.append(fmeta)
                        filecount += 1

            # update crawl time
            elapsed = time.time() - starttime
            dmeta['crawl_time'] = round(elapsed, 6)
            # check for empty dirs
            if cliargs['indexemptydirs']:
                tree_dirs.append(dmeta)
            elif not cliargs['indexemptydirs'] and (len(dirs) > 0
                                                    or filecount > 0):
                tree_dirs.append(dmeta)
            totalcrawltime += elapsed

        # check if doc count is more than es chunksize and bulk add to es
        if len(tree_dirs) + len(tree_files) >= config['es_chunksize']:
            es_bulk_add(worker, tree_dirs, tree_files, cliargs, totalcrawltime)
            del tree_dirs[:]
            del tree_files[:]
            totalcrawltime = 0

    # bulk add to es
    if len(tree_dirs) > 0 or len(tree_files) > 0:
        es_bulk_add(worker, tree_dirs, tree_files, cliargs, totalcrawltime)
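
# es_bulk_add is a diskover helper; a rough sketch of the idea (an assumption,
# not the project's code): send the collected directory and file docs to
# Elasticsearch in one bulk request via the elasticsearch-py bulk helper.
from elasticsearch import helpers

def es_bulk_add_sketch(es, index, tree_dirs, tree_files):
    # build one action per doc; the 'file' doc type follows the search used in
    # these examples, while 'directory' is assumed for dir docs
    actions = [{'_index': index, '_type': 'directory', '_source': d}
               for d in tree_dirs]
    actions += [{'_index': index, '_type': 'file', '_source': f}
                for f in tree_files]
    helpers.bulk(es, actions)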
Example 5
def qumulo_treewalk(path, ip, ses, q_crawl, num_sep, level, batchsize, cliargs,
                    logger, reindex_dict):
    batch = []
    dircount = 0
    totaldirs = 0
    starttime = time.time()

    # queue for paths
    q_paths = PyQueue()
    q_paths_results = PyQueue()
    lock = Lock()

    # set up threads for tree walk
    for i in range(cliargs['walkthreads']):
        t = Thread(target=apiwalk_worker,
                   args=(
                       ip,
                       ses,
                       q_paths,
                       q_paths_results,
                       lock,
                   ))
        t.daemon = True
        t.start()

    # set up progress bar
    if not cliargs['quiet'] and not cliargs['debug'] and not cliargs['verbose']:
        widgets = [
            progressbar.AnimatedMarker(), ' Crawling (Queue: ',
            progressbar.Counter(),
            progressbar.FormatLabel(''), ') ',
            progressbar.Timer()
        ]

        bar = progressbar.ProgressBar(widgets=widgets,
                                      max_value=progressbar.UnknownLength)
        bar.start()
    else:
        bar = None

    bartimestamp = time.time()
    for root, dirs, files in qumulo_api_walk(path, ip, ses, q_paths,
                                             q_paths_results):
        dircount += 1
        totaldirs += 1
        if len(dirs) == 0 and len(
                files) == 0 and not cliargs['indexemptydirs']:
            continue
        if root['path'] != '/':
            root_path = root['path'].rstrip(os.path.sep)
        else:
            root_path = root['path']
        if not dir_excluded(root_path, config, cliargs):
            batch.append((root, files))
            batch_len = len(batch)
            if batch_len >= batchsize:
                q_crawl.enqueue(scrape_tree_meta,
                                args=(
                                    batch,
                                    cliargs,
                                    reindex_dict,
                                ),
                                result_ttl=config['redis_ttl'])
                del batch[:]
                if cliargs['adaptivebatch']:
                    batchsize = adaptive_batch(q_crawl, cliargs, batchsize)

            # if at the maxdepth level, clear the dirs/files lists so we don't
            # descend further down the tree
            if cliargs['maxdepth']:
                num_sep_this = root_path.count(os.path.sep)
                if num_sep + level <= num_sep_this:
                    del dirs[:]
                    del files[:]

        else:  # directory excluded
            del dirs[:]
            del files[:]

        # update progress bar
        if bar:
            try:
                if time.time() - bartimestamp >= 2:
                    elapsed = round(time.time() - bartimestamp, 3)
                    dirspersec = round(dircount / elapsed, 3)
                    widgets[4] = progressbar.FormatLabel(', ' +
                                                         str(dirspersec) +
                                                         ' dirs/sec) ')
                    bartimestamp = time.time()
                    dircount = 0
                bar.update(len(q_crawl))
            except ZeroDivisionError:
                bar.update(0)
            except ValueError:
                bar.update(0)

    # add any remaining paths in the batch to the queue
    q_crawl.enqueue(scrape_tree_meta,
                    args=(
                        batch,
                        cliargs,
                        reindex_dict,
                    ),
                    result_ttl=config['redis_ttl'])

    # set up progress bar with time remaining
    if bar:
        bar.finish()
        bar_max_val = len(q_crawl)
        bar = progressbar.ProgressBar(max_value=bar_max_val)
        bar.start()
    else:
        bar = None

    # wait for queue to be empty and update progress bar
    while True:
        workers_busy = False
        workers = SimpleWorker.all(connection=redis_conn)
        for worker in workers:
            if worker._state == "busy":
                workers_busy = True
                break
        q_len = len(q_crawl)
        if bar:
            try:
                bar.update(bar_max_val - q_len)
            except ZeroDivisionError:
                bar.update(0)
            except ValueError:
                bar.update(0)
        if q_len == 0 and not workers_busy:
            break
        time.sleep(.5)

    if bar:
        bar.finish()

    elapsed = round(time.time() - starttime, 3)
    dirspersec = round(totaldirs / elapsed, 3)

    logger.info(
        "Finished crawling, elapsed time %s sec, dirs walked %s (%s dirs/sec)"
        % (elapsed, totaldirs, dirspersec))
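
# Quick illustration of the maxdepth check used above, assuming num_sep is the
# separator count of the starting path and level is cliargs['maxdepth']: depth
# is measured by counting path separators, and descent stops once a directory
# path reaches num_sep + level separators.
import os

start = '/data'
num_sep = start.count(os.path.sep)   # 1
level = 2
for p in ['/data/a', '/data/a/b', '/data/a/b/c']:
    stop = num_sep + level <= p.count(os.path.sep)
    print(p, 'stop descent' if stop else 'keep walking')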