Ejemplo n.º 1
0
def scrape_tree_meta(paths, cliargs, reindex_dict):
    """Scrape directory and file metadata for a batch of (root, files) paths.

    For each path, directory meta is pulled from disk (or the Qumulo API
    when --qumulo is used). If the directory times are unchanged
    ("sametimes"), the cached doc sources are reused from index2 with
    refreshed timestamps; otherwise file meta is scraped fresh. All
    collected docs are bulk added to Elasticsearch at the end of the job.
    """
    bot_logger = bot_log_setup(cliargs)
    jobstart = time.time()
    docs = []
    if cliargs['qumulo']:
        import diskover_qumulo

    for root, files in paths:
        dirstart = time.time()
        if cliargs['qumulo']:
            # strip any trailing separator, except for the filesystem root
            p = root['path']
            root_path = p if p == '/' else p.rstrip(os.path.sep)
            dmeta = diskover_qumulo.qumulo_get_dir_meta(
                root, cliargs, reindex_dict, bot_logger, redis_conn)
        else:
            root_path = root
            dmeta = get_dir_meta(root, cliargs, reindex_dict, bot_logger)

        if dmeta == "sametimes":
            # directory times haven't changed: reuse the doc sources for the
            # directory and all its files from index2
            dir_source, files_source = get_metadata(root_path, cliargs)
            worker = get_worker_name()
            datenow = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")
            for file_source in files_source:
                # refresh index time and worker name on the cached docs
                file_source['indexing_date'] = datenow
                file_source['worker_name'] = worker
                docs.append(('file', file_source))
            if dir_source:
                dir_source['indexing_date'] = datenow
                dir_source['worker_name'] = worker
                docs.append(('directory', dir_source))
                docs.append(
                    ('crawltime', root_path, (time.time() - dirstart)))
        else:
            # times differ in Redis than on disk: scrape fresh meta
            for name in files:
                if cliargs['qumulo']:
                    fmeta = diskover_qumulo.qumulo_get_file_meta(
                        name, cliargs, reindex_dict, bot_logger)
                else:
                    fmeta = get_file_meta(os.path.join(root, name), cliargs,
                                          reindex_dict, bot_logger)
                if fmeta:
                    docs.append(('file', fmeta))
            if dmeta:
                docs.append(('directory', dmeta))
                docs.append(
                    ('crawltime', root_path, (time.time() - dirstart)))

    if docs:
        es_bulk_adder(docs, cliargs, bot_logger)

    elapsed_time = round(time.time() - jobstart, 3)
    bot_logger.info('*** FINISHED JOB, Elapsed Time: ' + str(elapsed_time))
Ejemplo n.º 2
0
def file_scraper(file_in_thread_q, file_out_thread_q):
    """Thread worker: consume (worker, path, cliargs, reindex_dict) jobs
    from the input queue, scrape the file's metadata and push any result
    onto the output queue. Runs forever; task_done() lets the producer
    wait on file_in_thread_q.join()."""
    while True:
        worker, path, cliargs, reindex_dict = file_in_thread_q.get()
        if not cliargs['qumulo']:
            meta = get_file_meta(worker, path, cliargs, reindex_dict)
        else:
            import diskover_qumulo
            meta = diskover_qumulo.qumulo_get_file_meta(worker, path, cliargs, reindex_dict)
        if meta:
            file_out_thread_q.put(meta)
        file_in_thread_q.task_done()
Ejemplo n.º 3
0
def file_meta_collector():
    """Thread worker: pull (worker_name, path, cliargs, reindex_dict) jobs
    off the module-level filequeue, gather file metadata and feed results
    into filequeue_meta. Loops forever; task_done() supports
    filequeue.join()."""
    while True:
        worker_name, path, cliargs, reindex_dict = filequeue.get()
        if not cliargs['qumulo']:
            fmeta = get_file_meta(worker_name, path, cliargs, reindex_dict)
        else:
            import diskover_qumulo
            fmeta = diskover_qumulo.qumulo_get_file_meta(worker_name, path, cliargs, reindex_dict)
        if fmeta:
            filequeue_meta.put(fmeta)
        filequeue.task_done()
Ejemplo n.º 4
0
def scrape_tree_meta(paths, cliargs, reindex_dict):
    """Worker job: scrape directory and file metadata for a batch of paths.

    Each path is (root, dirs, files), or (root, dirs) when --dirsonly is
    given (files are then discovered with scandir). root may be a
    (path, stats) tuple when the diskover tree walk client embeds stats.
    Docs are buffered and bulk added to Elasticsearch whenever the buffer
    reaches config['es_chunksize'], with a final flush at the end.
    Uses module globals: worker, redis_conn, config.
    """
    global worker
    tree_dirs = []
    tree_files = []
    if cliargs['qumulo']:
        qumulo = True
        from diskover_qumulo import qumulo_get_dir_meta, qumulo_get_file_meta
    else:
        qumulo = False
    totalcrawltime = 0
    statsembeded = False

    path_count = 0
    for path in paths:
        path_count += 1
        starttime = time.time()
        if not cliargs['dirsonly']:
            root, dirs, files = path
        else:
            root, dirs = path
            files = []
        # only the first path is inspected for embeded stats — assumes every
        # path in the batch shares the same root format (TODO confirm)
        if path_count == 1:
            if type(root) is tuple:
                statsembeded = True
        if qumulo:
            # strip trailing path separator, except for the filesystem root
            if root['path'] != '/':
                root_path = root['path'].rstrip(os.path.sep)
            else:
                root_path = root['path']
            dmeta = qumulo_get_dir_meta(worker, root, cliargs, reindex_dict, redis_conn)
        # check if stats embeded in data from diskover tree walk client
        elif statsembeded:
            root_path = root[0]
            dmeta = get_dir_meta(worker, root, cliargs, reindex_dict, statsembeded=True)
        else:
            root_path = root
            dmeta = get_dir_meta(worker, root_path, cliargs, reindex_dict, statsembeded=False)

        if dmeta == "sametimes":
            # fetch meta data for directory and all its files (doc sources) from index2 since
            # directory times haven't changed
            dir_source, files_source = get_metadata(root_path, cliargs)
            datenow = datetime.utcnow().isoformat()
            for file_source in files_source:
                # update indexed at time
                file_source['indexing_date'] = datenow
                # update worker name
                file_source['worker_name'] = worker
                tree_files.append(('file', file_source))
            if dir_source:
                # update indexed at time
                dir_source['indexing_date'] = datenow
                # update worker name
                dir_source['worker_name'] = worker
                # update crawl time
                elapsed = time.time() - starttime
                dir_source['crawl_time'] = round(elapsed, 6)
                tree_dirs.append(dir_source)
                totalcrawltime += elapsed
        # get meta off disk since times different in Redis than on disk
        elif dmeta:
            # no files in batch, get them with scandir
            if cliargs['dirsonly']:
                for entry in scandir(root):
                    if entry.is_file(follow_symlinks=False) and not file_excluded(entry.name):
                        files.append(entry.name)
            filecount = 0
            for file in files:
                if qumulo:
                    fmeta = qumulo_get_file_meta(worker, file, cliargs, reindex_dict)
                elif statsembeded:
                    # file item already carries its stat data from the client
                    fmeta = get_file_meta(worker, file, cliargs, reindex_dict, statsembeded=True)
                else:
                    fmeta = get_file_meta(worker, os.path.join(root_path, file), cliargs,
                                         reindex_dict, statsembeded=False)
                if fmeta:
                    tree_files.append(fmeta)
                    filecount += 1

            # update crawl time
            elapsed = time.time() - starttime
            dmeta['crawl_time'] = round(elapsed, 6)
            # check for empty dirs and dirsonly cli arg
            if cliargs['indexemptydirs']:
                tree_dirs.append(dmeta)
            elif not cliargs['indexemptydirs'] and (len(dirs) > 0 or filecount > 0):
                # skip empty dirs unless explicitly requested
                tree_dirs.append(dmeta)
            totalcrawltime += elapsed

        # check if doc count is more than es chunksize and bulk add to es
        if len(tree_dirs) + len(tree_files) >= config['es_chunksize']:
            es_bulk_add(worker, tree_dirs, tree_files, cliargs, totalcrawltime)
            del tree_dirs[:]
            del tree_files[:]
            totalcrawltime = 0

    # bulk add to es
    if len(tree_dirs) > 0 or len(tree_files) > 0:
        es_bulk_add(worker, tree_dirs, tree_files, cliargs, totalcrawltime)
Ejemplo n.º 5
0
def scrape_tree_meta(paths, cliargs, reindex_dict):
    """Worker job: scrape dir and file metadata for a batch of (root, files)
    paths and bulk add the collected docs to Elasticsearch.

    If crawling one directory's files runs longer than
    diskover.config['filethreadtime'] seconds, helper threads are started
    and the remaining files are farmed out through thread queues.
    Uses module globals: bot_logger, redis_conn, diskover.config.
    """
    jobstart = time.time()
    worker = get_worker_name()
    tree_dirs = []
    tree_files = []
    tree_crawltimes = []
    qumulo = cliargs['qumulo']
    totalcrawltime = 0
    # amount of time (sec) before starting threads to help crawl files
    filethreadtime = diskover.config['filethreadtime']

    for path in paths:
        threadsstarted = False
        starttime = time.time()
        root, files = path
        if qumulo:
            import diskover_qumulo
            # strip trailing path separator, except for the filesystem root
            if root['path'] != '/':
                root_path = root['path'].rstrip(os.path.sep)
            else:
                root_path = root['path']
            dmeta = diskover_qumulo.qumulo_get_dir_meta(worker, root, cliargs, reindex_dict, redis_conn)
        else:
            root_path = root
            dmeta = get_dir_meta(worker, root, cliargs, reindex_dict)
        if dmeta == "sametimes":
            # fetch meta data for directory and all its files (doc sources) from index2 since
            # directory times haven't changed
            dir_source, files_source = get_metadata(root_path, cliargs)
            datenow = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")
            for file_source in files_source:
                # update indexed at time
                file_source['indexing_date'] = datenow
                # update worker name
                file_source['worker_name'] = worker
                tree_files.append(('file', file_source))
            if dir_source:
                # update indexed at time
                dir_source['indexing_date'] = datenow
                # update worker name
                dir_source['worker_name'] = worker
                tree_dirs.append(dir_source)
                elapsed = time.time() - starttime
                # record a crawlstat doc for this directory
                tree_crawltimes.append({
                        "path": root_path,
                        "worker_name": worker,
                        "crawl_time": round(elapsed, 10),
                        "indexing_date": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f"),
                        "_type": "crawlstat"})
                totalcrawltime += elapsed
        else:  # get meta off disk since times different in Redis than on disk
            for file in files:
                # spawn threads to help with getting file meta if running long
                if (time.time() - starttime) > filethreadtime:
                    if not threadsstarted:
                        bot_logger.info('*** %s taking more than %s to crawl, starting threads to help scrape file meta'
                                        % (root, filethreadtime))
                        # set up python Queue for threaded file meta scraping
                        file_in_thread_q = pyQueue()
                        file_out_thread_q = pyQueue()
                        start_file_threads(file_in_thread_q, file_out_thread_q)
                    threadsstarted = True
                    # once threads exist, all remaining files go through the queue
                    if qumulo:
                        file_in_thread_q.put((worker, file, cliargs, reindex_dict))
                    else:
                        file_in_thread_q.put((worker, os.path.join(root, file), cliargs, reindex_dict))
                else:
                    if qumulo:
                        fmeta = diskover_qumulo.qumulo_get_file_meta(worker, file, cliargs, reindex_dict)
                    else:
                        fmeta = get_file_meta(worker, os.path.join(root, file), cliargs, reindex_dict)
                    if fmeta:
                        tree_files.append(fmeta)
            if threadsstarted:
                bot_logger.info('*** Waiting for threads to finish...')
                # wait for threads to finish
                file_in_thread_q.join()
                bot_logger.info('*** Adding file meta thread results for %s' % root)
                # get all files and add to tree_files
                while file_out_thread_q.qsize():
                    tree_files.append(file_out_thread_q.get())
            if dmeta:
                tree_dirs.append(dmeta)
                elapsed = time.time() - starttime
                # record a crawlstat doc for this directory
                tree_crawltimes.append({
                    "path": root_path,
                    "worker_name": worker,
                    "crawl_time": round(elapsed, 10),
                    "indexing_date": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f"),
                    "_type": "crawlstat"})
                totalcrawltime += elapsed

    if len(tree_dirs) > 0 or len(tree_files) > 0:
        es_bulk_adder(worker, (tree_dirs, tree_files, tree_crawltimes), cliargs, totalcrawltime)

    elapsed_time = round(time.time() - jobstart, 3)
    bot_logger.info('*** FINISHED JOB, Elapsed Time: ' + str(elapsed_time))
Ejemplo n.º 6
0
def scrape_tree_meta(paths, cliargs, reindex_dict):
    """Worker job: scrape dir and file metadata for a batch of tree paths.

    Each path is (root, dirs, files); root may be a (path, stats) tuple when
    the diskover tree walk client embeds stats. When this bot receives a
    large batch while a majority of the other bots are idle, half the batch
    is randomly re-enqueued so idle bots can help. Directory docs aggregate
    filesize/item counts from their files. Docs are bulk added to
    Elasticsearch in config['es_chunksize'] batches.
    Uses module globals: redis_conn, q_crawl, config.
    """
    worker = get_worker_name()
    tree_dirs = []
    tree_files = []
    if cliargs['qumulo']:
        qumulo = True
        from diskover_qumulo import qumulo_get_dir_meta, qumulo_get_file_meta
    else:
        qumulo = False
    totalcrawltime = 0

    # check if other bots are idle and throw them some jobs (dir paths)
    if len(paths) >= cliargs['batchsize']:
        workers = Worker.all(connection=redis_conn)
        num_workers = len(workers)
        idle_count = 0
        majority_idle = False
        for w in workers:
            if w._state == "idle":
                idle_count += 1
            # BUG FIX: the idle counter was previously reused as a bool and
            # compared with == True, which also matched an idle count of
            # exactly 1 (1 == True); use a separate flag so jobs are only
            # offloaded when more than half of the workers are idle
            if idle_count > num_workers // 2:
                majority_idle = True
                break
        q_len = len(q_crawl)
        if q_len == 0 and majority_idle:
            # take half the paths randomly and re-enqueue them for idle bots
            shuffle(paths)
            n = len(paths) // 2
            tosspaths = paths[:n]
            paths = paths[n:]
            q_crawl.enqueue(scrape_tree_meta,
                            args=(
                                tosspaths,
                                cliargs,
                                reindex_dict,
                            ))

    for path in paths:
        starttime = time.time()
        root, dirs, files = path
        totaldirsize = 0
        totaldiritems_subdirs = len(dirs)
        totaldiritems_files = 0

        # check if stats embeded in data from diskover tree walk client
        statsembeded = isinstance(root, tuple)
        if qumulo:
            # strip trailing path separator, except for the filesystem root
            if root['path'] != '/':
                root_path = root['path'].rstrip(os.path.sep)
            else:
                root_path = root['path']
            dmeta = qumulo_get_dir_meta(worker, root, cliargs, reindex_dict,
                                        redis_conn)
        elif statsembeded:
            root_path = root[0]
            dmeta = get_dir_meta(worker, root, cliargs, reindex_dict,
                                 statsembeded=True)
        else:
            root_path = root
            dmeta = get_dir_meta(worker, root_path, cliargs, reindex_dict,
                                 statsembeded=False)

        if dmeta == "sametimes":
            # fetch meta data for directory and all its files (doc sources)
            # from index2 since directory times haven't changed
            dir_source, files_source = get_metadata(root_path, cliargs)
            datenow = datetime.utcnow().isoformat()
            for file_source in files_source:
                # refresh index time and worker name on the cached docs
                file_source['indexing_date'] = datenow
                file_source['worker_name'] = worker
                tree_files.append(('file', file_source))
            if dir_source:
                dir_source['indexing_date'] = datenow
                dir_source['worker_name'] = worker
                # update crawl time
                elapsed = time.time() - starttime
                dir_source['crawl_time'] = round(elapsed, 6)
                tree_dirs.append(dir_source)
                totalcrawltime += elapsed
        # get meta off disk since times different in Redis than on disk
        elif dmeta:
            # check if meta for files embeded
            if statsembeded:
                for file in files:
                    fmeta = get_file_meta(worker, file, cliargs, reindex_dict,
                                          statsembeded=True)
                    if fmeta:
                        tree_files.append(fmeta)
                        # add file size to totaldirsize
                        totaldirsize += fmeta['filesize']
                        totaldiritems_files += 1
            else:
                for file in files:
                    if qumulo:
                        fmeta = qumulo_get_file_meta(worker, file, cliargs,
                                                     reindex_dict)
                    else:
                        fmeta = get_file_meta(worker,
                                              os.path.join(root_path, file),
                                              cliargs, reindex_dict,
                                              statsembeded=False)
                    if fmeta:
                        tree_files.append(fmeta)
                        # add file size to totaldirsize
                        totaldirsize += fmeta['filesize']
                        totaldiritems_files += 1

            # update crawl time
            elapsed = time.time() - starttime
            dmeta['crawl_time'] = round(elapsed, 6)
            # update directory meta filesize, items
            dmeta['filesize'] = totaldirsize
            dmeta['items_files'] = totaldiritems_files
            dmeta['items_subdirs'] = totaldiritems_subdirs
            totaldiritems = totaldiritems_files + totaldiritems_subdirs
            dmeta['items'] += totaldiritems
            tree_dirs.append(dmeta)
            totalcrawltime += elapsed

        # check if doc count is more than es chunksize and bulk add to es
        if len(tree_dirs) + len(tree_files) >= config['es_chunksize']:
            # pass copies so the buffers can be cleared immediately —
            # presumably es_bulk_add hands them off asynchronously (TODO confirm)
            td = tree_dirs[:]
            tf = tree_files[:]
            es_bulk_add(worker, td, tf, cliargs, totalcrawltime)
            del tree_dirs[:]
            del tree_files[:]
            totalcrawltime = 0

    # bulk add remaining docs to es
    if len(tree_dirs) > 0 or len(tree_files) > 0:
        es_bulk_add(worker, tree_dirs, tree_files, cliargs, totalcrawltime)