def scrape_tree_meta(paths, cliargs, reindex_dict):
    """Crawl a batch of directory paths and bulk-index their metadata.

    Each item in *paths* is a (root, files) pair. For every root, directory
    meta is fetched (from Qumulo API when cliargs['qumulo'], otherwise from
    disk). If get_dir_meta returns the sentinel string "sametimes" — the
    directory mtimes are unchanged — doc sources are copied from the previous
    index via get_metadata instead of re-stat'ing files. All docs are
    accumulated as tagged tuples ('file'|'directory'|'crawltime', ...) and
    flushed once at the end through es_bulk_adder.

    :param paths: iterable of (root, files) work items
    :param cliargs: dict of CLI options (keys used here: 'qumulo')
    :param reindex_dict: prior-index lookup data passed through to meta helpers
    :returns: None (results go to Elasticsearch via es_bulk_adder)
    """
    bot_logger = bot_log_setup(cliargs)
    jobstart = time.time()
    tree = []
    if cliargs['qumulo']:
        import diskover_qumulo
    for path in paths:
        starttime = time.time()
        root, files = path
        if cliargs['qumulo']:
            # Qumulo roots are dicts; strip trailing separator except for "/"
            if root['path'] != '/':
                root_path = root['path'].rstrip(os.path.sep)
            else:
                root_path = root['path']
            dmeta = diskover_qumulo.qumulo_get_dir_meta(
                root, cliargs, reindex_dict, bot_logger, redis_conn)
        else:
            root_path = root
            dmeta = get_dir_meta(root, cliargs, reindex_dict, bot_logger)
        if dmeta == "sametimes":
            # fetch meta data for directory and all it's files (doc sources) from index2 since
            # directory times haven't changed
            dir_source, files_source = get_metadata(root_path, cliargs)
            worker = get_worker_name()
            datenow = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")
            for file_source in files_source:
                # update indexed at time
                file_source['indexing_date'] = datenow
                # update worker name
                file_source['worker_name'] = worker
                tree.append(('file', file_source))
            if dir_source:
                # update indexed at time
                dir_source['indexing_date'] = datenow
                # update worker name
                dir_source['worker_name'] = worker
                tree.append(('directory', dir_source))
                tree.append(
                    ('crawltime', root_path, (time.time() - starttime)))
        else:
            # get meta off disk since times different in Redis than on disk
            for file in files:
                if cliargs['qumulo']:
                    fmeta = diskover_qumulo.qumulo_get_file_meta(
                        file, cliargs, reindex_dict, bot_logger)
                else:
                    fmeta = get_file_meta(os.path.join(root, file), cliargs,
                                          reindex_dict, bot_logger)
                if fmeta:
                    tree.append(('file', fmeta))
            if dmeta:
                tree.append(('directory', dmeta))
                tree.append(
                    ('crawltime', root_path, (time.time() - starttime)))
    # single flush for the whole batch; es_bulk_adder presumably unpacks the
    # ('file'|'directory'|'crawltime', ...) tags — TODO confirm against its def
    if len(tree) > 0:
        es_bulk_adder(tree, cliargs, bot_logger)
    elapsed_time = round(time.time() - jobstart, 3)
    bot_logger.info('*** FINISHED JOB, Elapsed Time: ' + str(elapsed_time))
def file_scraper(file_in_thread_q, file_out_thread_q):
    """Worker-thread loop that scrapes file metadata from a queue of paths.

    Blocks forever on *file_in_thread_q*, expecting items shaped as
    (worker, path, cliargs, reindex_dict). Metadata is gathered via the
    Qumulo API when cliargs['qumulo'] is set, otherwise via get_file_meta;
    truthy results are pushed onto *file_out_thread_q*. task_done() is
    called once per item so the producer can join() the input queue.
    """
    while True:
        worker_name, file_path, args, reindex = file_in_thread_q.get()
        if args['qumulo']:
            import diskover_qumulo
            meta = diskover_qumulo.qumulo_get_file_meta(
                worker_name, file_path, args, reindex)
        else:
            meta = get_file_meta(worker_name, file_path, args, reindex)
        # only forward real metadata; falsy means excluded/unreadable file
        if meta:
            file_out_thread_q.put(meta)
        file_in_thread_q.task_done()
def file_meta_collector():
    """Daemon loop draining the module-global *filequeue* of file work items.

    Each queue item is a (worker_name, path, cliargs, reindex_dict) tuple.
    File metadata is fetched through the Qumulo helper when
    cliargs['qumulo'] is set, else through get_file_meta, and any truthy
    result is forwarded to the module-global *filequeue_meta* queue.
    Marks every item done so filequeue.join() can unblock.
    """
    while True:
        bot_name, target_path, args, reindex = filequeue.get()
        fetch_qumulo = args['qumulo']
        if fetch_qumulo:
            import diskover_qumulo
            doc = diskover_qumulo.qumulo_get_file_meta(
                bot_name, target_path, args, reindex)
        else:
            doc = get_file_meta(bot_name, target_path, args, reindex)
        # skip falsy results (excluded or unreadable files)
        if doc:
            filequeue_meta.put(doc)
        filequeue.task_done()
def scrape_tree_meta(paths, cliargs, reindex_dict):
    """Crawl a batch of tree-walk items and bulk-index dir/file metadata.

    Supports three input shapes per path item:
    - normal: (root, dirs, files)
    - cliargs['dirsonly']: (root, dirs) — files are discovered here via scandir
    - stats-embedded (root is a tuple, detected on the first item): stat data
      was already collected by the diskover tree walk client, so meta helpers
      are called with statsembeded=True and given the tuples directly.

    Uses the module-global *worker* name. Docs are flushed to Elasticsearch in
    chunks of config['es_chunksize'] via es_bulk_add, with a final flush of
    any remainder.

    :param paths: iterable of tree-walk work items (shapes above)
    :param cliargs: dict of CLI options ('qumulo', 'dirsonly', 'indexemptydirs')
    :param reindex_dict: prior-index lookup data passed through to meta helpers
    :returns: None
    """
    global worker
    tree_dirs = []
    tree_files = []
    if cliargs['qumulo']:
        qumulo = True
        from diskover_qumulo import qumulo_get_dir_meta, qumulo_get_file_meta
    else:
        qumulo = False
    totalcrawltime = 0
    statsembeded = False
    path_count = 0
    for path in paths:
        path_count += 1
        starttime = time.time()
        if not cliargs['dirsonly']:
            root, dirs, files = path
        else:
            root, dirs = path
            files = []
        # stats-embedded detection happens only on the first item; the whole
        # batch is assumed to share one shape — TODO confirm sender guarantees
        if path_count == 1:
            if type(root) is tuple:
                statsembeded = True
        if qumulo:
            # Qumulo roots are dicts; strip trailing separator except for "/"
            if root['path'] != '/':
                root_path = root['path'].rstrip(os.path.sep)
            else:
                root_path = root['path']
            dmeta = qumulo_get_dir_meta(worker, root, cliargs, reindex_dict,
                                        redis_conn)
        # check if stats embeded in data from diskover tree walk client
        elif statsembeded:
            root_path = root[0]
            dmeta = get_dir_meta(worker, root, cliargs, reindex_dict,
                                 statsembeded=True)
        else:
            root_path = root
            dmeta = get_dir_meta(worker, root_path, cliargs, reindex_dict,
                                 statsembeded=False)
        if dmeta == "sametimes":
            # fetch meta data for directory and all it's files (doc sources) from index2 since
            # directory times haven't changed
            dir_source, files_source = get_metadata(root_path, cliargs)
            datenow = datetime.utcnow().isoformat()
            for file_source in files_source:
                # update indexed at time
                file_source['indexing_date'] = datenow
                # update worker name
                file_source['worker_name'] = worker
                # NOTE(review): this appends a ('file', doc) tuple while the
                # disk path below appends bare dicts — verify es_bulk_add
                # accepts both shapes
                tree_files.append(('file', file_source))
            if dir_source:
                # update indexed at time
                dir_source['indexing_date'] = datenow
                # update worker name
                dir_source['worker_name'] = worker
                # update crawl time
                elapsed = time.time() - starttime
                dir_source['crawl_time'] = round(elapsed, 6)
                tree_dirs.append(dir_source)
                totalcrawltime += elapsed
        # get meta off disk since times different in Redis than on disk
        elif dmeta:
            # no files in batch, get them with scandir
            if cliargs['dirsonly']:
                for entry in scandir(root):
                    if entry.is_file(follow_symlinks=False) and not file_excluded(entry.name):
                        files.append(entry.name)
            filecount = 0
            for file in files:
                if qumulo:
                    fmeta = qumulo_get_file_meta(worker, file, cliargs,
                                                 reindex_dict)
                elif statsembeded:
                    # file is a stat tuple from the tree walk client
                    fmeta = get_file_meta(worker, file, cliargs, reindex_dict,
                                          statsembeded=True)
                else:
                    fmeta = get_file_meta(worker, os.path.join(root_path, file),
                                          cliargs, reindex_dict,
                                          statsembeded=False)
                if fmeta:
                    tree_files.append(fmeta)
                    filecount += 1
            # update crawl time
            elapsed = time.time() - starttime
            dmeta['crawl_time'] = round(elapsed, 6)
            # check for empty dirs and dirsonly cli arg
            if cliargs['indexemptydirs']:
                tree_dirs.append(dmeta)
            elif not cliargs['indexemptydirs'] and (len(dirs) > 0 or filecount > 0):
                tree_dirs.append(dmeta)
            totalcrawltime += elapsed
        # check if doc count is more than es chunksize and bulk add to es
        if len(tree_dirs) + len(tree_files) >= config['es_chunksize']:
            es_bulk_add(worker, tree_dirs, tree_files, cliargs, totalcrawltime)
            del tree_dirs[:]
            del tree_files[:]
            totalcrawltime = 0
    # bulk add to es
    if len(tree_dirs) > 0 or len(tree_files) > 0:
        es_bulk_add(worker, tree_dirs, tree_files, cliargs, totalcrawltime)
def scrape_tree_meta(paths, cliargs, reindex_dict):
    """Crawl (root, files) work items, spawning helper threads for slow dirs.

    For each directory, meta comes from the Qumulo API or from disk. The
    "sametimes" sentinel from get_dir_meta triggers a copy of doc sources
    from the previous index via get_metadata. If a single directory takes
    longer than config['filethreadtime'] seconds, the remaining files are
    handed to helper threads through a pair of queues created by
    start_file_threads, and their results are drained back before the
    directory doc is recorded. Per-directory crawl times are emitted as
    "crawlstat" docs. Everything is flushed once via es_bulk_adder.

    :param paths: iterable of (root, files) work items
    :param cliargs: dict of CLI options (key used here: 'qumulo')
    :param reindex_dict: prior-index lookup data passed through to meta helpers
    :returns: None
    """
    jobstart = time.time()
    worker = get_worker_name()
    tree_dirs = []
    tree_files = []
    tree_crawltimes = []
    qumulo = cliargs['qumulo']
    totalcrawltime = 0
    # amount of time (sec) before starting threads to help crawl files
    filethreadtime = diskover.config['filethreadtime']
    for path in paths:
        threadsstarted = False
        starttime = time.time()
        root, files = path
        if qumulo:
            import diskover_qumulo
            # Qumulo roots are dicts; strip trailing separator except for "/"
            if root['path'] != '/':
                root_path = root['path'].rstrip(os.path.sep)
            else:
                root_path = root['path']
            dmeta = diskover_qumulo.qumulo_get_dir_meta(worker, root, cliargs,
                                                        reindex_dict, redis_conn)
        else:
            root_path = root
            dmeta = get_dir_meta(worker, root, cliargs, reindex_dict)
        if dmeta == "sametimes":
            # fetch meta data for directory and all it's files (doc sources) from index2 since
            # directory times haven't changed
            dir_source, files_source = get_metadata(root_path, cliargs)
            datenow = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")
            for file_source in files_source:
                # update indexed at time
                file_source['indexing_date'] = datenow
                # update worker name
                file_source['worker_name'] = worker
                # NOTE(review): tuple-tagged here, bare dicts on the disk
                # path below — verify es_bulk_adder accepts both shapes
                tree_files.append(('file', file_source))
            if dir_source:
                # update indexed at time
                dir_source['indexing_date'] = datenow
                # update worker name
                dir_source['worker_name'] = worker
                tree_dirs.append(dir_source)
                elapsed = time.time() - starttime
                tree_crawltimes.append({
                    "path": root_path,
                    "worker_name": worker,
                    "crawl_time": round(elapsed, 10),
                    "indexing_date": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f"),
                    "_type": "crawlstat"})
                totalcrawltime += elapsed
        else:
            # get meta off disk since times different in Redis than on disk
            for file in files:
                # spawn threads to help with getting file meta if running long
                if (time.time() - starttime) > filethreadtime:
                    if not threadsstarted:
                        bot_logger.info('*** %s taking more than %s to crawl, starting threads to help scrape file meta' % (root, filethreadtime))
                        # set up python Queue for threaded file meta scraping
                        file_in_thread_q = pyQueue()
                        file_out_thread_q = pyQueue()
                        start_file_threads(file_in_thread_q, file_out_thread_q)
                        threadsstarted = True
                    if qumulo:
                        file_in_thread_q.put((worker, file, cliargs, reindex_dict))
                    else:
                        file_in_thread_q.put((worker, os.path.join(root, file), cliargs, reindex_dict))
                else:
                    # still under the time budget: scrape inline on this thread
                    if qumulo:
                        fmeta = diskover_qumulo.qumulo_get_file_meta(worker, file, cliargs, reindex_dict)
                    else:
                        fmeta = get_file_meta(worker, os.path.join(root, file), cliargs, reindex_dict)
                    if fmeta:
                        tree_files.append(fmeta)
            if threadsstarted:
                bot_logger.info('*** Waiting for threads to finish...')
                # wait for threads to finish
                file_in_thread_q.join()
                bot_logger.info('*** Adding file meta thread results for %s' % root)
                # get all files and add to tree_files
                while file_out_thread_q.qsize():
                    tree_files.append(file_out_thread_q.get())
            if dmeta:
                tree_dirs.append(dmeta)
                elapsed = time.time() - starttime
                tree_crawltimes.append({
                    "path": root_path,
                    "worker_name": worker,
                    "crawl_time": round(elapsed, 10),
                    "indexing_date": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f"),
                    "_type": "crawlstat"})
                totalcrawltime += elapsed
    # single flush of all dir/file/crawlstat docs for this batch
    if len(tree_dirs) > 0 or len(tree_files) > 0:
        es_bulk_adder(worker, (tree_dirs, tree_files, tree_crawltimes), cliargs, totalcrawltime)
    elapsed_time = round(time.time() - jobstart, 3)
    bot_logger.info('*** FINISHED JOB, Elapsed Time: ' + str(elapsed_time))
def scrape_tree_meta(paths, cliargs, reindex_dict):
    """Crawl tree-walk items, redistributing work to idle bots when possible.

    Before crawling, if the batch is at least cliargs['batchsize'] and more
    than half of the RQ workers are idle with an empty crawl queue, half of
    the paths (chosen randomly) are re-enqueued on q_crawl for other bots.

    Each path item is (root, dirs, files). Directory meta comes from the
    Qumulo API, from embedded tree-walk-client stats (root is a tuple), or
    from disk. The "sametimes" sentinel from get_dir_meta copies doc sources
    from the previous index via get_metadata. Directory docs are enriched
    with aggregate filesize and item counts. Docs are flushed to
    Elasticsearch in chunks of config['es_chunksize'] via es_bulk_add, plus
    a final flush of any remainder.

    :param paths: list of (root, dirs, files) work items
    :param cliargs: dict of CLI options ('qumulo', 'batchsize')
    :param reindex_dict: prior-index lookup data passed through to meta helpers
    :returns: None
    """
    worker = get_worker_name()
    tree_dirs = []
    tree_files = []
    if cliargs['qumulo']:
        qumulo = True
        from diskover_qumulo import qumulo_get_dir_meta, qumulo_get_file_meta
    else:
        qumulo = False
    totalcrawltime = 0
    # check if other bots are idle and throw them some jobs (dir paths)
    if len(paths) >= cliargs['batchsize']:
        # BUGFIX: the original reused one variable as both the idle counter
        # and the majority flag; after a loop that never hit the threshold,
        # `workers_idle == True` compared the int count to True, which is
        # True when exactly one worker was idle (1 == True in Python) —
        # redistributing work even though the majority was busy. A separate
        # boolean makes the majority test explicit.
        idle_count = 0
        majority_idle = False
        workers = Worker.all(connection=redis_conn)
        num_workers = len(workers)
        for w in workers:
            if w._state == "idle":
                idle_count += 1
                if idle_count > num_workers // 2:
                    majority_idle = True
                    break
        q_len = len(q_crawl)
        if q_len == 0 and majority_idle:
            # take half the paths randomly
            shuffle(paths)
            n = len(paths) // 2
            tosspaths = paths[:n]
            paths = paths[n:]
            q_crawl.enqueue(scrape_tree_meta, args=(
                tosspaths, cliargs, reindex_dict,))
    for path in paths:
        starttime = time.time()
        root, dirs, files = path
        totaldirsize = 0
        totaldiritems_subdirs = len(dirs)
        totaldiritems_files = 0
        # check if stats embeded in data from diskover tree walk client
        if type(root) is tuple:
            statsembeded = True
        else:
            statsembeded = False
        if qumulo:
            # Qumulo roots are dicts; strip trailing separator except for "/"
            if root['path'] != '/':
                root_path = root['path'].rstrip(os.path.sep)
            else:
                root_path = root['path']
            dmeta = qumulo_get_dir_meta(worker, root, cliargs, reindex_dict,
                                        redis_conn)
        else:
            if statsembeded:
                root_path = root[0]
                dmeta = get_dir_meta(worker, root, cliargs, reindex_dict,
                                     statsembeded=True)
            else:
                root_path = root
                dmeta = get_dir_meta(worker, root_path, cliargs, reindex_dict,
                                     statsembeded=False)
        if dmeta == "sametimes":
            # fetch meta data for directory and all it's files (doc sources) from index2 since
            # directory times haven't changed
            dir_source, files_source = get_metadata(root_path, cliargs)
            datenow = datetime.utcnow().isoformat()
            for file_source in files_source:
                # update indexed at time
                file_source['indexing_date'] = datenow
                # update worker name
                file_source['worker_name'] = worker
                # NOTE(review): tuple-tagged here while the disk path appends
                # bare dicts — verify es_bulk_add accepts both shapes
                tree_files.append(('file', file_source))
            if dir_source:
                # update indexed at time
                dir_source['indexing_date'] = datenow
                # update worker name
                dir_source['worker_name'] = worker
                # update crawl time
                elapsed = time.time() - starttime
                dir_source['crawl_time'] = round(elapsed, 6)
                tree_dirs.append(dir_source)
                totalcrawltime += elapsed
        # get meta off disk since times different in Redis than on disk
        elif dmeta:
            # check if meta for files embeded
            if statsembeded:
                for file in files:
                    fmeta = get_file_meta(worker, file, cliargs, reindex_dict,
                                          statsembeded=True)
                    if fmeta:
                        tree_files.append(fmeta)
                        # add file size to totaldirsize
                        totaldirsize += fmeta['filesize']
                        totaldiritems_files += 1
            else:
                for file in files:
                    if qumulo:
                        fmeta = qumulo_get_file_meta(worker, file, cliargs,
                                                     reindex_dict)
                    else:
                        fmeta = get_file_meta(worker,
                                              os.path.join(root_path, file),
                                              cliargs, reindex_dict,
                                              statsembeded=False)
                    if fmeta:
                        tree_files.append(fmeta)
                        # add file size to totaldirsize
                        totaldirsize += fmeta['filesize']
                        totaldiritems_files += 1
            # update crawl time
            elapsed = time.time() - starttime
            dmeta['crawl_time'] = round(elapsed, 6)
            # update directory meta filesize, items
            dmeta['filesize'] = totaldirsize
            dmeta['items_files'] = totaldiritems_files
            dmeta['items_subdirs'] = totaldiritems_subdirs
            totaldiritems = totaldiritems_files + totaldiritems_subdirs
            dmeta['items'] += totaldiritems
            tree_dirs.append(dmeta)
            totalcrawltime += elapsed
        # check if doc count is more than es chunksize and bulk add to es
        if len(tree_dirs) + len(tree_files) >= config['es_chunksize']:
            # shallow copies so the buffers can be cleared immediately
            td = tree_dirs[:]
            tf = tree_files[:]
            es_bulk_add(worker, td, tf, cliargs, totalcrawltime)
            del tree_dirs[:]
            del tree_files[:]
            totalcrawltime = 0
    # bulk add to es
    if len(tree_dirs) > 0 or len(tree_files) > 0:
        es_bulk_add(worker, tree_dirs, tree_files, cliargs, totalcrawltime)