def start_crawlbot_scanner(cliargs, logger, rootdir_path, botdirlist, reindex_dict):
    """Run the crawl bot continuous scanner until interrupted.

    Publishes ``botdirlist`` as the module-global ``dirlist``, starts
    ``config['crawlbot_botthreads']`` daemon threads running ``bot_thread``
    and then loops forever. Every ``config['crawlbot_dirlisttime']`` seconds
    the loop reloads the directory docs from the index, updates the disk
    space info and recalculates directory sizes.

    Args:
        cliargs: parsed command line options; ``cliargs['index']`` names the
            index the directory docs are read from.
        logger: logger used for progress messages.
        rootdir_path: root directory path, handed to the bot threads and to
            ``add_diskspace``.
        botdirlist: initial list of directory docs for the bot threads.
        reindex_dict: passed through unchanged to every bot thread.

    This function does not return normally. On Ctrl-c it sets ``dirlist``
    to ``None`` (presumably the stop signal for the bot threads) and leaves
    via ``sys.exit(0)``.
    """
    global dirlist
    dirlist = botdirlist

    logger.info('diskover crawl bot continuous scanner starting up (--crawlbot)')
    logger.info('Randomly scanning for changes every %s sec using %s threads',
                config['crawlbot_botsleep'], config['crawlbot_botthreads'])
    logger.info('*** Press Ctrl-c to shutdown ***')

    threadlist = []
    try:
        for i in range(config['crawlbot_botthreads']):
            thread = threading.Thread(
                target=bot_thread,
                args=(i, cliargs, logger, rootdir_path, reindex_dict))
            # daemon threads do not keep the process alive after sys.exit()
            thread.daemon = True
            threadlist.append(thread)
            thread.start()

        starttime = time.time()

        # The bot threads pick directories from dirlist; this loop only
        # refreshes the shared data every crawlbot_dirlisttime seconds.
        while True:
            time.sleep(config['crawlbot_dirlisttime'])
            elapsed = get_time(time.time() - starttime)
            logger.info(
                '*** crawlbot: getting new dirlist from ES, crawlbot has been running for %s',
                elapsed)
            # "with" releases the lock even when the index query raises; a
            # bare acquire()/release() pair would leave it held forever.
            with lock:
                dirlist = index_get_docs(cliargs, logger, doctype='directory',
                                         index=cliargs['index'])
            # add disk space info to es index
            add_diskspace(cliargs['index'], logger, rootdir_path)
            # calculate directory sizes and items
            calc_dir_sizes(cliargs, logger)
    except KeyboardInterrupt:
        print('Ctrl-c keyboard interrupt, shutting down...')
        dirlist = None
        sys.exit(0)
def bot_thread(threadnum, cliargs, logger, mpq, mpq_lock, totaljobs, rootdir_path, reindex_dict):
    """Crawl bot worker loop: re-check random directories, reindex changed ones.

    Repeatedly picks a random entry from the module-global ``dirlist``
    (index 1 of an entry is used as the path, index 2 as the stored mtime),
    compares the stored mtime with the directory's current mtime on disk
    and, when they differ, deletes the path's docs, crawls it again and
    recalculates its directory size. Statistics are logged once a minute.
    The loop ends when ``dirlist`` is ``None``.

    Args:
        threadnum: thread number, used in the statistics log line.
        cliargs: parsed command line options; ``cliargs['verbose']`` enables
            per-directory log messages.
        logger: logger used for all messages.
        mpq, mpq_lock, totaljobs: passed through to ``diskover.crawl_tree``.
        rootdir_path: not used by this function.
        reindex_dict: passed to ``diskover.index_delete_path`` and replaced
            by its return value before each re-crawl.
    """
    starttime = time.time()
    last_stats = time.time()
    checked = 0     # directories whose mtime was compared
    updated = 0     # directories that were reindexed
    same_hits = 0   # random picks skipped because they repeated the last path
    last_path = ''

    while True:
        # display stats if 1 min elapsed
        if time.time() - last_stats >= 60:
            running = time.time() - starttime
            logger.info(
                '### crawlbot thread-%s: %s dirs checked (%s dir/s), %s dirs updated, %s same dir hits, running for %s ###',
                threadnum, checked, round(checked / running, 2), updated, same_hits,
                diskover.get_time(running))
            last_stats = time.time()

        # Read the global once per iteration: it can be rebound (or set to
        # None) by another thread between two separate reads.
        candidates = dirlist
        if candidates is None:
            break
        if not candidates:
            # Nothing to check yet; randint(0, -1) would raise ValueError
            # and kill this thread. Wait for the list to be refreshed.
            time.sleep(diskover.config['botsleep'])
            continue

        # random pick from dirlist
        entry = candidates[randint(0, len(candidates) - 1)]
        path = entry[1]
        mtime_utc = entry[2]

        # Pick a new path if same as last time. With a single entry there is
        # no other path to pick, so skipping would spin forever.
        if path == last_path and len(candidates) > 1:
            same_hits += 1
            continue
        last_path = path

        # check directory's mtime on disk
        try:
            mtime_now_utc = time.mktime(time.gmtime(os.lstat(path).st_mtime))
        except (IOError, OSError):
            if cliargs['verbose']:
                logger.info('Error crawling directory %s', path)
            # throttle so an unreadable directory cannot cause a busy spin
            time.sleep(diskover.config['botsleep'])
            continue

        if mtime_now_utc == mtime_utc:
            if cliargs['verbose']:
                logger.info('Mtime unchanged: %s', path)
        else:
            updated += 1
            logger.info('*** Mtime changed! Reindexing: %s', path)
            # delete existing path docs (non-recursive)
            reindex_dict = diskover.index_delete_path(path, cliargs, logger, reindex_dict)
            # start crawling
            diskover.crawl_tree(path, cliargs, logger, mpq, mpq_lock, totaljobs, reindex_dict)
            # calculate directory size for path
            diskover.calc_dir_sizes(cliargs, logger, path=path)

        time.sleep(diskover.config['botsleep'])
        checked += 1
def run_command(threadnum, command_dict, clientsock, cliargs, logger):
    """Run a command received on the listener socket.

    Builds a diskover command line from ``command_dict``, runs it with
    ``subprocess.Popen``, waits for it to finish and reports progress to the
    client as newline-terminated JSON messages (``taskstart`` followed by
    ``taskfinish``, or an ``error`` message).

    Args:
        threadnum: number of the handler thread, used in log messages.
        command_dict: the decoded command. ``action`` is required and is one
            of ``crawl``, ``finddupes``, ``hotdirs``, ``reindex``,
            ``updatedirsizes`` or ``kill``; depending on the action ``path``,
            ``index2`` or ``taskid`` is required as well. ``index`` falls
            back to ``config['index']``; ``mtime``, ``minsize``,
            ``batchsize``, ``adaptivebatch``, ``optimizeindex``, ``autotag``
            and ``indexemptydirs`` fall back to the same key in ``cliargs``.
        clientsock: connected client socket the replies are sent on.
        cliargs: parsed command line options providing the defaults.
        logger: logger used for all messages.
    """
    global socket_tasks
    global clientlist

    def option(name):
        """Return ``name`` from the command, else from cliargs, as a string."""
        try:
            return str(command_dict[name])
        except KeyError:
            return str(cliargs[name])

    # try to get index name from command or use from diskover config file
    try:
        index = str(command_dict['index'])
    except KeyError:
        index = str(config['index'])

    mtime = option('mtime')
    minsize = option('minsize')
    batchsize = option('batchsize')
    adaptivebatch = option('adaptivebatch')
    optimizeindex = option('optimizeindex')
    autotag = option('autotag')
    indexemptydirs = option('indexemptydirs')

    # Read outside the try block so a broken config still raises KeyError
    # instead of being reported to the client as a malformed command.
    pythonpath = config['python_path']
    diskoverpath = config['diskover_path']

    try:
        action = command_dict['action']
        base = [pythonpath, diskoverpath, '-b', batchsize, '-i', index]

        # set up command for different action
        if action == 'crawl':
            path = command_dict['path']
            cmd = base + ['-d', path, '-m', mtime, '-s', minsize, '-q', '-F']
        elif action == 'finddupes':
            cmd = base + ['--finddupes', '-q', '-F']
        elif action == 'hotdirs':
            index2 = str(command_dict['index2'])
            cmd = base + ['--hotdirs', index2, '-q', '-F']
        elif action == 'reindex':
            recursive = command_dict.get('recursive', 'false')
            path = command_dict['path']
            reindex_flag = '-R' if recursive == 'true' else '-r'
            cmd = base + ['-d', path, reindex_flag, '-q', '-F']
        elif action == 'updatedirsizes':
            recursive = command_dict.get('recursive', 'false')
            if recursive == 'true':
                cmd = base + ['--dircalcsonly', '-q', '-F']
            else:
                path = command_dict['path']
                cmd = base + ['-d', path, '--dircalcsonly',
                              '--maxdcdepth', '0', '-q', '-F']
        elif action == 'kill':
            taskid = command_dict['taskid']
            logger.info("[thread-%s]: Kill task message received! (taskid:%s)",
                        threadnum, taskid)
            # do something here to kill task (future)
            clientsock.send(b'{"msg": "taskkilled"}\n')
            return
        else:
            logger.warning("Unknown action")
            clientsock.send(b'{"error": "unknown action"}\n')
            return

        # optional switches: adaptive batch, optimize index, auto tag,
        # index empty dirs
        switches = (
            (adaptivebatch, '-a'),
            (optimizeindex, '-O'),
            (autotag, '-A'),
            (indexemptydirs, '-e'),
        )
        for value, flag in switches:
            if value in ('True', 'true'):
                cmd.append(flag)

        # run command using subprocess
        starttime = time.time()
        taskid = str(uuid.uuid4()).encode('utf-8')
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # add process to socket_tasks dict
        socket_tasks[taskid] = process
        clientsock.send(b'{"msg": "taskstart", "taskid": "' + taskid + b'"}\n')
        logger.info("[thread-%s]: Running command (taskid:%s)",
                    threadnum, taskid.decode('utf-8'))
        logger.info(cmd)
        output, error = process.communicate()

        # send exit msg to client
        exitcode = str(process.returncode).encode('utf-8')
        # errors='replace': the subprocess may print bytes that are not valid
        # UTF-8. A strict decode raises UnicodeDecodeError, which is a
        # ValueError, and the client would get "value error" instead of the
        # taskfinish message.
        logger.debug('Command output:')
        logger.debug(output.decode('utf-8', errors='replace'))
        logger.debug('Command error:')
        logger.debug(error.decode('utf-8', errors='replace'))
        elapsedtime = str(get_time(time.time() - starttime)).encode('utf-8')
        logger.info("Finished command (taskid:%s), exit code: %s, elapsed time: %s",
                    taskid.decode('utf-8'), exitcode.decode('utf-8'),
                    elapsedtime.decode('utf-8'))
        message = b'{"msg": "taskfinish", "taskid": "%s", "exitcode": %s, "elapsedtime": "%s"}\n' \
            % (taskid, exitcode, elapsedtime)
        clientsock.send(message)

    except KeyError as err:
        # a required key (action, path, index2, taskid) is missing from the
        # command; tell the client instead of letting the exception escape
        logger.warning("Missing required key in command: %s", err)
        clientsock.send(b'{"error": "missing key"}\n')
    except ValueError:
        logger.warning("Value error")
        clientsock.send(b'{"error": "value error"}\n')
    except socket.error as e:
        # On Python 3 socket.error is an alias of OSError, so a failure to
        # start the subprocess is reported here as well.
        logger.error("[thread-%s]: Socket error (%s)", threadnum, e)
def run_command(threadnum, command_dict, clientsock, cliargs, logger, verbose):
    """Run a command received on the listener socket.

    Builds a diskover command line from ``command_dict``, runs it with
    ``subprocess.Popen``, waits for it to finish and reports progress to the
    client as newline-terminated JSON messages (``taskstart`` followed by
    ``taskfinish``, or an ``error`` message).

    Args:
        threadnum: number of the handler thread, used in log messages.
        command_dict: the decoded command. ``action`` is required and is one
            of ``crawl``, ``finddupes``, ``reindex`` or ``kill``; ``path`` is
            required for ``crawl`` and ``reindex``, ``taskid`` for ``kill``.
            ``index`` falls back to ``diskover.config['index']`` and
            ``batchsize`` to ``cliargs['batchsize']``.
        clientsock: connected client socket the replies are sent on.
        cliargs: parsed command line options providing the defaults.
        logger: logger used for all messages.
        verbose: not used by this function.
    """
    global socket_tasks
    global clientlist

    # try to get index name from command or use from diskover config file
    try:
        index = command_dict['index']
    except KeyError:
        index = diskover.config['index']

    # try to get worker batch size from command or use default
    try:
        batchsize = str(command_dict['batchsize'])
    except KeyError:
        batchsize = str(cliargs['batchsize'])

    # Read outside the try block so a broken config still raises KeyError
    # instead of being reported to the client as a malformed command.
    pythonpath = diskover.config['python_path']
    diskoverpath = diskover.config['diskover_path']

    try:
        action = command_dict['action']
        # "-u" is passed to the python interpreter itself, ahead of the script
        base = [pythonpath, '-u', diskoverpath, '-b', batchsize, '-i', index]

        # set up command for different action
        if action == 'crawl':
            path = command_dict['path']
            cmd = base + ['-d', path, '-q']
        elif action == 'finddupes':
            cmd = base + ['-D', '-q']
        elif action == 'reindex':
            recursive = command_dict.get('recursive', 'false')
            path = command_dict['path']
            reindex_flag = '-R' if recursive == 'true' else '-r'
            cmd = base + ['-d', path, reindex_flag, '-q']
        elif action == 'kill':
            taskid = command_dict['taskid']
            logger.info("[thread-%s]: Kill task message received! (taskid:%s)",
                        threadnum, taskid)
            # do something here to kill task (future)
            clientsock.send(b'{"msg": "taskkilled"}\n')
            return
        else:
            logger.warning("Unknown action")
            clientsock.send(b'{"error": "unknown action"}\n')
            return

        # run command using subprocess
        starttime = time.time()
        taskid = str(uuid.uuid4()).encode('utf-8')
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # add process to socket_tasks dict
        socket_tasks[taskid] = process
        clientsock.send(b'{"msg": "taskstart", "taskid": "' + taskid + b'"}\n')
        logger.info("[thread-%s]: Running command (taskid:%s)",
                    threadnum, taskid.decode('utf-8'))
        logger.info(cmd)
        output, error = process.communicate()

        # send exit msg to client
        exitcode = str(process.returncode).encode('utf-8')
        # errors='replace': the subprocess may print bytes that are not valid
        # UTF-8. A strict decode raises UnicodeDecodeError, which is a
        # ValueError, and the client would get "value error" instead of the
        # taskfinish message.
        logger.debug('Command output:')
        logger.debug(output.decode('utf-8', errors='replace'))
        logger.debug('Command error:')
        logger.debug(error.decode('utf-8', errors='replace'))
        elapsedtime = str(diskover.get_time(time.time() - starttime)).encode('utf-8')
        logger.info("Finished command (taskid:%s), exit code: %s, elapsed time: %s",
                    taskid.decode('utf-8'), exitcode.decode('utf-8'),
                    elapsedtime.decode('utf-8'))
        message = b'{"msg": "taskfinish", "taskid": "%s", "exitcode": %s, "elapsedtime": "%s"}\n' \
            % (taskid, exitcode, elapsedtime)
        clientsock.send(message)

    except KeyError as err:
        # a required key (action, path, taskid) is missing from the command;
        # tell the client instead of letting the exception escape
        logger.warning("Missing required key in command: %s", err)
        clientsock.send(b'{"error": "missing key"}\n')
    except ValueError:
        logger.warning("Value error")
        clientsock.send(b'{"error": "value error"}\n')
    except socket.error as e:
        # On Python 3 socket.error is an alias of OSError, so a failure to
        # start the subprocess is reported here as well.
        logger.error("[thread-%s]: Socket error (%s)", threadnum, e)