Example #1
def socket_thread_handler_twc(threadnum, q, q_kill, lock, rootdir, num_sep,
                              level, batchsize, cliargs, logger, reindex_dict):
    """This is the socket thread handler tree walk client function.
    Stream of directory listings (pickle) from diskover treewalk
    client connections are enqueued to redis rq queue.
    """

    while True:
        try:
            c = q.get()
            clientsock, addr = c
            logger.debug(clientsock)
            logger.debug(addr)

            totalfiles = 0
            while True:
                data = recv_one_message(clientsock)
                if not data:
                    break
                if data == b'SIGKILL' or data == 'SIGKILL':
                    q_kill.put(b'SIGKILL')
                    break

                # unpickle data sent from client
                data_decoded = pickle.loads(data)
                logger.debug(data_decoded)

                # enqueue to redis
                batch = []
                for root, dirs, files in data_decoded:
                    files_len = len(files)
                    totalfiles += files_len
                    # check for empty dirs
                    if len(dirs) == 0 and len(files) == 0 and not cliargs['indexemptydirs']:
                        continue
                    batch.append((root, dirs, files))
                    batch_len = len(batch)
                    if batch_len >= batchsize or (
                            cliargs['adaptivebatch'] and
                            totalfiles >= config['adaptivebatch_maxfiles']):
                        q_crawl.enqueue(scrape_tree_meta,
                                        args=(
                                            batch,
                                            cliargs,
                                            reindex_dict,
                                        ),
                                        result_ttl=config['redis_ttl'])
                        if cliargs['debug'] or cliargs['verbose']:
                            logger.info(
                                "enqueued batchsize: %s (batchsize: %s)" %
                                (batch_len, batchsize))
                        del batch[:]
                        totalfiles = 0
                        if cliargs['adaptivebatch']:
                            batchsize = adaptive_batch(q_crawl, cliargs,
                                                       batchsize)
                            if cliargs['debug'] or cliargs['verbose']:
                                logger.info("batchsize set to: %s" % batchsize)

                if len(batch) > 0:
                    # add any remaining in batch to queue
                    q_crawl.enqueue(scrape_tree_meta,
                                    args=(
                                        batch,
                                        cliargs,
                                        reindex_dict,
                                    ),
                                    result_ttl=config['redis_ttl'])
                    del batch[:]

            # close connection to client
            clientsock.close()
            logger.info("[thread-%s]: %s closed connection" %
                        (threadnum, str(addr)))
            q.task_done()

        except socket.error as e:
            logger.error("[thread-%s]: Socket error (%s)" % (threadnum, e))
def scrape_tree_meta(paths, cliargs, reindex_dict):
    try:
        global worker
        tree_dirs = []
        tree_files = []
        totalcrawltime = 0
        statsembeded = False
        num_workers = len(SimpleWorker.all(connection=redis_conn))

        path_count = 0
        filenames = []
        for path in paths:
            path_count += 1
            starttime = time.time()
            if not cliargs['dirsonly']:
                root, dirs, files = path
            else:
                root, dirs = path
                files = []
            if path_count == 1:
                if type(root) is tuple:
                    statsembeded = True
            # check if stats are embedded in the data from the diskover tree walk client or crawlapi
            if statsembeded:
                root_path = root[0]
                dmeta = get_dir_meta(worker,
                                     root,
                                     cliargs,
                                     reindex_dict,
                                     statsembeded=True)
            else:
                root_path = root
                dmeta = get_dir_meta(worker,
                                     root_path,
                                     cliargs,
                                     reindex_dict,
                                     statsembeded=False)

            if dmeta:
                # no files in batch, get them with scandir
                if cliargs['dirsonly']:
                    for entry in scandir(root_path):
                        if (entry.is_file(follow_symlinks=False)
                                and not file_excluded(entry.name)):
                            files.append(entry.name)
                filecount = 0
                # check if the directory has a ton of files in it and farm out meta collection to other worker bots
                files_count = len(files)
                if cliargs['splitfiles'] and files_count >= cliargs['splitfilesnum']:
                    fmetas = []
                    for filelist in split_list(files,
                                               int(files_count / num_workers)):
                        fmetas.append(
                            q_crawl.enqueue(file_meta_collector,
                                            args=(
                                                filelist,
                                                root_path,
                                                statsembeded,
                                                cliargs,
                                                reindex_dict,
                                            ),
                                            result_ttl=config['redis_ttl']))
                    n = 0
                    while n < len(fmetas):
                        if fmetas[n].result:
                            for fmeta in fmetas[n].result:
                                if fmeta:
                                    tree_files.append(fmeta)
                                    filecount += 1
                            n += 1
                        else:
                            # job result not ready yet; back off briefly instead of busy-waiting
                            time.sleep(.05)
                    del fmetas[:]
                else:
                    for file in files:
                        # file is a (name, stats) tuple when stats are embedded, else a plain name
                        filenames.append(file[0] if statsembeded else file)
                        if statsembeded:
                            fmeta = get_file_meta(worker,
                                                  file,
                                                  cliargs,
                                                  reindex_dict,
                                                  statsembeded=True)
                        else:
                            fmeta = get_file_meta(worker,
                                                  os.path.join(
                                                      root_path, file),
                                                  cliargs,
                                                  reindex_dict,
                                                  statsembeded=False)
                        if fmeta:
                            tree_files.append(fmeta)
                            filecount += 1

                # update crawl time
                elapsed = time.time() - starttime
                dmeta['crawl_time'] = round(elapsed, 6)
                # only index empty dirs when the indexemptydirs cli arg is set
                if cliargs['indexemptydirs']:
                    tree_dirs.append(dmeta)
                elif not cliargs['indexemptydirs'] and (len(dirs) > 0
                                                        or filecount > 0):
                    tree_dirs.append(dmeta)
                totalcrawltime += elapsed

            # check if doc count is more than es chunksize and bulk add to es
            if len(tree_dirs) + len(tree_files) >= config['es_chunksize']:
                es_bulk_add(worker, tree_dirs, tree_files, cliargs,
                            totalcrawltime)
                del tree_dirs[:]
                del tree_files[:]
                totalcrawltime = 0

        # bulk add to es
        if len(tree_dirs) > 0 or len(tree_files) > 0:
            es_bulk_add(worker, tree_dirs, tree_files, cliargs, totalcrawltime)

        print('%s | processed %d files' % (datetime.now(), len(filenames)))
        return True, filenames
    except Exception as e:
        print('%s | error | %s' % (datetime.now(), e))
        return False, []
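
The splitfiles branch above relies on a split_list helper to chunk the file list across the worker bots. A minimal sketch consistent with how it is called here (the name and signature come from the call site; this is not necessarily diskover's implementation):

def split_list(items, chunk_size):
    # yield successive chunks of at most chunk_size items; guard against a zero
    # chunk size, which the caller could produce when num_workers > files_count
    chunk_size = max(1, int(chunk_size))
    for i in range(0, len(items), chunk_size):
        yield items[i:i + chunk_size]
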
def socket_thread_handler_twc(threadnum, q, q_kill, rootdir, num_sep, level,
                              batchsize, cliargs, logger, reindex_dict):
    """This is the socket thread handler tree walk client function.
    Stream of directory listings (pickle) from diskover treewalk
    client connections are enqueued to redis rq queue.
    """

    while True:

        try:

            c = q.get()
            clientsock, addr = c
            logger.debug(clientsock)
            logger.debug(addr)

            while True:
                data = recv_one_message(clientsock)
                #logger.debug(data)

                if not data:
                    break

                if data == b'SIGKILL' or data == 'SIGKILL':
                    q_kill.put(b'SIGKILL')
                    break

                data_decoded = pickle.loads(data)
                logger.debug(data_decoded)

                # enqueue to redis
                batch = []
                for root, dirs, files in data_decoded:
                    if len(dirs) == 0 and len(files) == 0 and not cliargs['indexemptydirs']:
                        continue
                    # check if meta stat data has been embedded in the data from the client
                    if type(root) is tuple:
                        rootpath = root[0]
                    else:
                        rootpath = root
                    if not dir_excluded(rootpath, config, cliargs['verbose']):
                        batch.append((root, dirs, files))
                        batch_len = len(batch)
                        if batch_len >= batchsize:
                            q_crawl.enqueue(scrape_tree_meta,
                                            args=(
                                                batch,
                                                cliargs,
                                                reindex_dict,
                                            ))
                            del batch[:]
                            if cliargs['adaptivebatch']:
                                batchsize = adaptive_batch(
                                    q_crawl, cliargs, batchsize)

                        # check if at maxdepth level and delete dirs/files lists to not
                        # descend further down the tree
                        num_sep_this = rootpath.count(os.path.sep)
                        if num_sep + level <= num_sep_this:
                            del dirs[:]
                            del files[:]

                    else:  # directory excluded
                        del dirs[:]
                        del files[:]

                if len(batch) > 0:
                    # add any remaining in batch to queue
                    q_crawl.enqueue(scrape_tree_meta,
                                    args=(
                                        batch,
                                        cliargs,
                                        reindex_dict,
                                    ))
                    del batch[:]

            # close connection to client
            clientsock.close()
            logger.info("[thread-%s]: %s closed connection" %
                        (threadnum, str(addr)))
            q.task_done()

        except socket.error as e:
            logger.error("[thread-%s]: Socket error (%s)" % (threadnum, e))
def scrape_tree_meta(paths, cliargs, reindex_dict):
    global worker
    tree_dirs = []
    tree_files = []
    totalcrawltime = 0
    num_workers = len(SimpleWorker.all(connection=redis_conn))

    for path in paths:
        starttime = time.time()
        root, dirs, files = path

        # check if a dirchunk marker or stats are embedded in the data from
        # the diskover tree walk client or crawlapi
        if type(root) is tuple:
            if root[1] == 'dchunk':
                dirchunk = True
                statsembeded = False
            else:
                statsembeded = True
                dirchunk = False
        else:
            statsembeded = False
            dirchunk = False

        if statsembeded:
            root_path = root[0]
            dmeta = get_dir_meta(worker,
                                 root,
                                 cliargs,
                                 reindex_dict,
                                 statsembeded=True)
        else:
            if dirchunk:
                root_path = root[0]
                dmeta = {'chunkpath': root_path}
            else:
                root_path = root
                dmeta = get_dir_meta(worker,
                                     root_path,
                                     cliargs,
                                     reindex_dict,
                                     statsembeded=False)

        if dmeta:
            filecount = 0
            # check if the directory has a ton of files in it and farm out meta collection to other worker bots
            files_count = len(files)
            if cliargs['splitfiles'] and files_count >= cliargs['splitfilesnum']:
                fmetas = []
                for filelist in split_list(files,
                                           int(files_count / num_workers)):
                    fmetas.append(
                        q_crawl.enqueue(file_meta_collector,
                                        args=(
                                            filelist,
                                            root_path,
                                            statsembeded,
                                            cliargs,
                                            reindex_dict,
                                        ),
                                        result_ttl=config['redis_ttl']))
                n = 0
                while n < len(fmetas):
                    if fmetas[n].result:
                        for fmeta in fmetas[n].result:
                            if fmeta:
                                tree_files.append(fmeta)
                                filecount += 1
                        n += 1
                    else:
                        time.sleep(.05)
                del fmetas[:]
            else:
                for file in files:
                    if statsembeded:
                        fmeta = get_file_meta(worker,
                                              file,
                                              cliargs,
                                              reindex_dict,
                                              statsembeded=True)
                    else:
                        fmeta = get_file_meta(worker,
                                              os.path.join(root_path, file),
                                              cliargs,
                                              reindex_dict,
                                              statsembeded=False)
                    if fmeta:
                        tree_files.append(fmeta)
                        filecount += 1

            # update crawl time
            elapsed = time.time() - starttime
            dmeta['crawl_time'] = round(elapsed, 6)
            # check for empty dirs
            if cliargs['indexemptydirs']:
                tree_dirs.append(dmeta)
            elif not cliargs['indexemptydirs'] and (len(dirs) > 0
                                                    or filecount > 0):
                tree_dirs.append(dmeta)
            totalcrawltime += elapsed

        # check if doc count is more than es chunksize and bulk add to es
        if len(tree_dirs) + len(tree_files) >= config['es_chunksize']:
            es_bulk_add(worker, tree_dirs, tree_files, cliargs, totalcrawltime)
            del tree_dirs[:]
            del tree_files[:]
            totalcrawltime = 0

    # bulk add to es
    if len(tree_dirs) > 0 or len(tree_files) > 0:
        es_bulk_add(worker, tree_dirs, tree_files, cliargs, totalcrawltime)
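
Each scrape_tree_meta variant finishes by flushing its accumulated docs through es_bulk_add. A minimal sketch of what such a bulk indexer could look like with the elasticsearch-py helpers API; the doc types follow the code above, but the action format and the omission of any per-worker crawl-stat doc are simplifications, not diskover's actual implementation:

from elasticsearch import helpers


def es_bulk_add(worker_name, dirlist, filelist, cliargs, totalcrawltime=None):
    # build bulk index actions for directory docs and file docs
    actions = []
    for dmeta in dirlist:
        actions.append({'_index': cliargs['index'], '_type': 'directory',
                        '_source': dmeta})
    for fmeta in filelist:
        actions.append({'_index': cliargs['index'], '_type': 'file',
                        '_source': fmeta})
    # es and config come from the module scope, as in the code above
    helpers.bulk(es, actions, chunk_size=config['es_chunksize'],
                 request_timeout=config['es_timeout'])
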
Example #5
def scrape_tree_meta(paths, cliargs, reindex_dict):
    worker = get_worker_name()
    tree_dirs = []
    tree_files = []
    if cliargs['qumulo']:
        qumulo = True
        from diskover_qumulo import qumulo_get_dir_meta, qumulo_get_file_meta
    else:
        qumulo = False
    totalcrawltime = 0

    # check if other bots are idle and throw them some jobs (dir paths)
    if len(paths) >= cliargs['batchsize']:
        idle_count = 0
        half_workers_idle = False
        workers = Worker.all(connection=redis_conn)
        num_workers = len(workers)
        for w in workers:
            if w._state == "idle":
                idle_count += 1
            if idle_count > num_workers // 2:
                half_workers_idle = True
                break
        q_len = len(q_crawl)
        if q_len == 0 and half_workers_idle:
            # take half the paths randomly
            shuffle(paths)
            n = len(paths) // 2
            tosspaths = paths[:n]
            paths = paths[n:]
            q_crawl.enqueue(scrape_tree_meta,
                            args=(
                                tosspaths,
                                cliargs,
                                reindex_dict,
                            ))

    for path in paths:
        starttime = time.time()
        root, dirs, files = path
        totaldirsize = 0
        totaldiritems_subdirs = len(dirs)
        totaldiritems_files = 0

        # check if stats are embedded in the data from the diskover tree walk client
        if type(root) is tuple:
            statsembeded = True
        else:
            statsembeded = False
        if qumulo:
            if root['path'] != '/':
                root_path = root['path'].rstrip(os.path.sep)
            else:
                root_path = root['path']
            dmeta = qumulo_get_dir_meta(worker, root, cliargs, reindex_dict,
                                        redis_conn)
        else:
            if statsembeded:
                root_path = root[0]
                dmeta = get_dir_meta(worker,
                                     root,
                                     cliargs,
                                     reindex_dict,
                                     statsembeded=True)
            else:
                root_path = root
                dmeta = get_dir_meta(worker,
                                     root_path,
                                     cliargs,
                                     reindex_dict,
                                     statsembeded=False)
        if dmeta == "sametimes":
            # fetch metadata for the directory and all of its files (doc sources) from index2,
            # since the directory times haven't changed
            dir_source, files_source = get_metadata(root_path, cliargs)
            datenow = datetime.utcnow().isoformat()
            for file_source in files_source:
                # update indexed at time
                file_source['indexing_date'] = datenow
                # update worker name
                file_source['worker_name'] = worker
                tree_files.append(('file', file_source))
            if dir_source:
                # update indexed at time
                dir_source['indexing_date'] = datenow
                # update worker name
                dir_source['worker_name'] = worker
                # update crawl time
                elapsed = time.time() - starttime
                dir_source['crawl_time'] = round(elapsed, 6)
                tree_dirs.append(dir_source)
                totalcrawltime += elapsed
        # get meta off disk since times different in Redis than on disk
        elif dmeta:
            # check if meta for files is embedded
            if statsembeded:
                for file in files:
                    fmeta = get_file_meta(worker,
                                          file,
                                          cliargs,
                                          reindex_dict,
                                          statsembeded=True)
                    if fmeta:
                        tree_files.append(fmeta)
                        # add file size to totaldirsize
                        totaldirsize += fmeta['filesize']
                        totaldiritems_files += 1
            else:
                for file in files:
                    if qumulo:
                        fmeta = qumulo_get_file_meta(worker, file, cliargs,
                                                     reindex_dict)
                    else:
                        fmeta = get_file_meta(worker,
                                              os.path.join(root_path, file),
                                              cliargs,
                                              reindex_dict,
                                              statsembeded=False)
                    if fmeta:
                        tree_files.append(fmeta)
                        # add file size to totaldirsize
                        totaldirsize += fmeta['filesize']
                        totaldiritems_files += 1

            # update crawl time
            elapsed = time.time() - starttime
            dmeta['crawl_time'] = round(elapsed, 6)
            # update directory meta filesize, items
            dmeta['filesize'] = totaldirsize
            dmeta['items_files'] = totaldiritems_files
            dmeta['items_subdirs'] = totaldiritems_subdirs
            totaldiritems = totaldiritems_files + totaldiritems_subdirs
            dmeta['items'] += totaldiritems
            tree_dirs.append(dmeta)
            totalcrawltime += elapsed

        # check if doc count is more than es chunksize and bulk add to es
        if len(tree_dirs) + len(tree_files) >= config['es_chunksize']:
            td = tree_dirs[:]
            tf = tree_files[:]
            es_bulk_add(worker, td, tf, cliargs, totalcrawltime)
            del tree_dirs[:]
            del tree_files[:]
            totalcrawltime = 0

    # bulk add to es
    if len(tree_dirs) > 0 or len(tree_files) > 0:
        es_bulk_add(worker, tree_dirs, tree_files, cliargs, totalcrawltime)
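
For context, the batches built by the socket handlers reach scrape_tree_meta as ordinary RQ jobs. A minimal usage sketch; the queue name, Redis settings, and the cliargs/reindex_dict stand-ins are illustrative assumptions:

from redis import Redis
from rq import Queue

redis_conn = Redis(host='localhost', port=6379)
q_crawl = Queue('diskover_crawl', connection=redis_conn)  # queue name is an assumption

# minimal stand-ins for the real cli arguments and reindex dictionary
cliargs = {'index': 'diskover-index', 'qumulo': False, 'batchsize': 50,
           'splitfiles': False, 'splitfilesnum': 10000,
           'indexemptydirs': False}
reindex_dict = {'file': [], 'directory': []}

# one batch is a list of (root, dirs, files) tuples from the tree walk;
# scrape_tree_meta would be imported from the worker module in a real setup
batch = [('/data/projects', ['src', 'docs'], ['README.md', 'setup.py'])]
job = q_crawl.enqueue(scrape_tree_meta, args=(batch, cliargs, reindex_dict),
                      result_ttl=3600)
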
Example #6
def calc_dir_size(dirlist, cliargs):
    """This is the calculate directory size worker function.
    It gets a directory list from the Queue and searches ES for all
    subdirs in each directory (recursive) and sums their filesize and
    items fields to create a total filesize and item count for each directory doc.
    Updates directory doc's filesize and items fields.
    """

    # check if other bots are idle and throw them some jobs (dir paths)
    if len(dirlist) >= cliargs['batchsize']:
        idle_count = 0
        half_workers_idle = False
        workers = Worker.all(connection=redis_conn)
        num_workers = len(workers)
        for w in workers:
            if w._state == "idle":
                idle_count += 1
            if idle_count > num_workers // 2:
                half_workers_idle = True
                break
        q_len = len(q_calc)
        if q_len == 0 and half_workers_idle:
            # take half the paths randomly
            shuffle(dirlist)
            n = len(dirlist) // 2
            tossdirs = dirlist[:n]
            dirlist = dirlist[n:]
            # enqueue on the calc queue, whose length was checked above
            q_calc.enqueue(calc_dir_size, args=(
                tossdirs,
                cliargs,
            ))

    doclist = []
    for path in dirlist:
        totalitems = 1  # 1 for itself
        # file doc search with aggregate for sum filesizes
        # escape special characters
        newpath = escape_chars(path[1])
        parentpath = escape_chars(
            os.path.abspath(os.path.join(path[1], os.pardir)))
        pathbasename = escape_chars(os.path.basename(path[1]))

        # create wildcard string and check for / (root) path
        if newpath == r'\/':
            newpathwildcard = r'\/*'
        else:
            newpathwildcard = newpath + r'\/*'

        # check if / (root) path
        if newpath == r'\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query':
                        '(path_parent: ' + parentpath + ' AND filename: ' +
                        pathbasename + ') OR path_parent: ' + newpath +
                        ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard':
                        'true'
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    },
                    "total_files": {
                        "sum": {
                            "field": "items_files"
                        }
                    },
                    "total_subdirs": {
                        "sum": {
                            "field": "items_subdirs"
                        }
                    }
                }
            }

        # run the aggregation query against ES over the directory and all of its subdirs
        res = es.search(index=cliargs['index'],
                        doc_type='directory',
                        body=data,
                        request_timeout=config['es_timeout'])

        # total file size sum
        totalsize = res['aggregations']['total_size']['value']

        # total items sum for all subdirs count
        totalitems_subdirs = res['aggregations']['total_subdirs']['value']

        # total items sum for all files count
        totalitems_files = res['aggregations']['total_files']['value']

        totalitems += totalitems_subdirs + totalitems_files

        # update filesize and items fields for directory (path) doc
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {
                'filesize': totalsize,
                'items': totalitems,
                'items_files': totalitems_files,
                'items_subdirs': totalitems_subdirs
            }
        }
        doclist.append(d)

    index_bulk_add(es, doclist, config, cliargs)
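
calc_dir_size depends on escape_chars to make paths safe inside a Lucene query_string. A minimal sketch of such an escaper; the exact character set diskover escapes is an assumption, but escaping '/' is what makes the root-path check against '\/' above line up:

def escape_chars(text):
    # backslash-escape characters that are special in Lucene query_string syntax
    # (this character set is an illustrative guess, not diskover's exact list)
    specials = '+-&|!(){}[]^"~*?:/\\ '
    return ''.join('\\' + c if c in specials else c for c in text)

# example: escape_chars('/data/my dir') returns \/data\/my\ dir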