Ejemplo n.º 1
0
def start_crawlbot_scanner(cliargs, logger, rootdir_path, botdirlist,
                           reindex_dict):
    """Start the crawl bot continuous scanner.

    Publishes ``botdirlist`` as the module-level ``dirlist``, starts
    ``config['crawlbot_botthreads']`` daemon threads running ``bot_thread``
    and then loops forever. Every ``config['crawlbot_dirlisttime']`` seconds
    the loop:

    * replaces ``dirlist`` with a fresh result of ``index_get_docs`` for
      directory docs (while holding the module-level ``lock``),
    * calls ``add_diskspace`` to add disk space info to the index,
    * calls ``calc_dir_sizes`` to recalculate directory sizes.

    Args:
        cliargs: Parsed command line arguments; ``cliargs['index']`` is used.
        logger: Logger used for status output.
        rootdir_path: Root directory path passed to the bot threads and to
            ``add_diskspace``.
        botdirlist: Initial directory list handed to the bot threads.
        reindex_dict: Passed through unchanged to each bot thread.

    Exits the process with status 0 on ``KeyboardInterrupt`` after setting
    ``dirlist`` to ``None``.
    """
    global dirlist
    dirlist = botdirlist

    logger.info(
        'diskover crawl bot continuous scanner starting up (--crawlbot)')
    logger.info('Randomly scanning for changes every %s sec using %s threads',
                config['crawlbot_botsleep'], config['crawlbot_botthreads'])
    logger.info('*** Press Ctrl-c to shutdown ***')

    try:
        for i in range(config['crawlbot_botthreads']):
            thread = threading.Thread(target=bot_thread,
                                      args=(
                                          i,
                                          cliargs,
                                          logger,
                                          rootdir_path,
                                          reindex_dict,
                                      ))
            # daemon threads do not keep the process alive on shutdown
            thread.daemon = True
            thread.start()

        starttime = time.time()
        while True:
            # every x seconds: refresh dirlist to pick up new directories,
            # update disk space info and recalculate directory sizes
            time.sleep(config['crawlbot_dirlisttime'])
            elapsed = get_time(time.time() - starttime)
            logger.info(
                '*** crawlbot: getting new dirlist from ES, crawlbot has been running for %s',
                elapsed)
            # "with" guarantees the lock is released even if the index
            # query raises; a bare acquire/release pair would leave it held
            with lock:
                dirlist = index_get_docs(cliargs,
                                         logger,
                                         doctype='directory',
                                         index=cliargs['index'])
            # add disk space info to es index
            add_diskspace(cliargs['index'], logger, rootdir_path)
            # calculate directory sizes and items
            calc_dir_sizes(cliargs, logger)

    except KeyboardInterrupt:
        print('Ctrl-c keyboard interrupt, shutting down...')
        # None is the value bot threads are expected to treat as "stop"
        # (NOTE(review): confirm against the bot_thread in this module)
        dirlist = None
        sys.exit(0)
Ejemplo n.º 2
0
def bot_thread(threadnum, cliargs, logger, mpq, mpq_lock, totaljobs,
               rootdir_path, reindex_dict):
    """Run one crawl bot worker loop.

    Repeatedly picks a random entry from the module-level ``dirlist``
    (element ``[1]`` is used as the path, element ``[2]`` as the indexed
    mtime), compares the indexed mtime with the directory's current mtime on
    disk and, when they differ, deletes the path's docs, re-crawls it and
    recalculates its directory size. Stats are logged about once a minute.
    The loop ends when ``dirlist`` is ``None``.

    Args:
        threadnum: Thread number, used only in log output.
        cliargs: Parsed command line arguments; ``cliargs['verbose']``
            enables per-directory log lines.
        logger: Logger used for status output.
        mpq, mpq_lock, totaljobs: Passed through to ``diskover.crawl_tree``.
        rootdir_path: Unused here; kept for interface compatibility.
        reindex_dict: Passed to and replaced by the result of
            ``diskover.index_delete_path``.
    """
    starttime = time.time()
    last_report = starttime
    updated = 0
    checked = 0
    same_hits = 0
    last_path = ''
    while True:
        if time.time() - last_report >= 60:
            elapsed = diskover.get_time(time.time() - starttime)
            # display stats if 1 min elapsed
            logger.info(
                '### crawlbot thread-%s: %s dirs checked (%s dir/s), %s dirs updated, %s same dir hits, running for %s ###',
                threadnum, checked,
                round(checked / (time.time() - starttime), 2), updated,
                same_hits, elapsed)
            last_report = time.time()

        # Read the global once per iteration: another thread may rebind it
        # (to a new list or to None) at any time, so len() and indexing must
        # operate on the same object.
        current = dirlist
        if current is None:
            break
        count = len(current)
        if count == 0:
            # nothing to scan yet; randint(0, -1) would raise ValueError
            time.sleep(diskover.config['botsleep'])
            continue

        # random pick from dirlist
        entry = current[randint(0, count - 1)]
        path = entry[1]
        mtime_utc = entry[2]

        # pick a new path if same as last time; with a single entry there is
        # no other path to pick, so skipping would spin forever without sleep
        if path == last_path and count > 1:
            same_hits += 1
            continue
        last_path = path

        # check directory's mtime on disk
        try:
            mtime_now_utc = time.mktime(time.gmtime(os.lstat(path).st_mtime))
        except (IOError, OSError):
            if cliargs['verbose']:
                logger.info('Error crawling directory %s', path)
            continue

        if mtime_now_utc == mtime_utc:
            if cliargs['verbose']:
                logger.info('Mtime unchanged: %s', path)
        else:
            updated += 1
            logger.info('*** Mtime changed! Reindexing: %s', path)
            # delete existing path docs (non-recursive)
            reindex_dict = diskover.index_delete_path(path, cliargs, logger,
                                                      reindex_dict)
            # start crawling
            diskover.crawl_tree(path, cliargs, logger, mpq, mpq_lock,
                                totaljobs, reindex_dict)
            # calculate directory size for path
            diskover.calc_dir_sizes(cliargs, logger, path=path)
        time.sleep(diskover.config['botsleep'])
        checked += 1
Ejemplo n.º 3
0
def run_command(threadnum, command_dict, clientsock, cliargs, logger):
    """Run a command received on the listener socket.

    Builds a diskover command line from ``command_dict`` (falling back to
    ``config`` for the index name and to ``cliargs`` for the other options),
    runs it as a subprocess, and reports progress to the client as
    newline-terminated JSON messages: ``taskstart`` when the process is
    launched and ``taskfinish`` (with exit code and elapsed time) when it
    ends. The process is stored in the module-level ``socket_tasks`` dict
    keyed by the task id (as bytes).

    Supported ``command_dict['action']`` values: ``crawl``, ``finddupes``,
    ``hotdirs``, ``reindex``, ``updatedirsizes`` and ``kill``. ``kill`` only
    logs and acknowledges with ``taskkilled``; any other value is answered
    with an ``unknown action`` error.

    Args:
        threadnum: Thread number, used only in log output.
        command_dict: Mapping with the command and its options.
        clientsock: Connected client socket the replies are sent on.
        cliargs: Parsed command line arguments used as option defaults.
        logger: Logger used for status output.

    Raises:
        KeyError: If a required key (for example ``action`` or ``path``) is
            missing from ``command_dict`` or a default is missing from
            ``config``/``cliargs``. ``ValueError`` and ``socket.error`` are
            handled inside the function.
    """
    global socket_tasks

    def _option(key, fallback):
        """Return command_dict[key] as str, or fallback[key] if absent."""
        try:
            return str(command_dict[key])
        except KeyError:
            return str(fallback[key])

    # index name comes from the command or the diskover config file;
    # everything else comes from the command or the cli defaults
    index = _option('index', config)
    mtime = _option('mtime', cliargs)
    minsize = _option('minsize', cliargs)
    batchsize = _option('batchsize', cliargs)
    adaptivebatch = _option('adaptivebatch', cliargs)
    optimizeindex = _option('optimizeindex', cliargs)
    autotag = _option('autotag', cliargs)
    indexemptydirs = _option('indexemptydirs', cliargs)

    try:
        action = command_dict['action']
        pythonpath = config['python_path']
        diskoverpath = config['diskover_path']
        # arguments shared by every action's command line
        base_cmd = [pythonpath, diskoverpath, '-b', batchsize, '-i', index]

        # set up command for different action
        if action == 'crawl':
            path = command_dict['path']
            cmd = base_cmd + [
                '-d', path, '-m', mtime, '-s', minsize, '-q', '-F'
            ]

        elif action == 'finddupes':
            cmd = base_cmd + ['--finddupes', '-q', '-F']

        elif action == 'hotdirs':
            index2 = str(command_dict['index2'])
            cmd = base_cmd + ['--hotdirs', index2, '-q', '-F']

        elif action == 'reindex':
            recursive = command_dict.get('recursive', 'false')
            path = command_dict['path']
            reindex_flag = '-R' if recursive == 'true' else '-r'
            cmd = base_cmd + ['-d', path, reindex_flag, '-q', '-F']

        elif action == 'updatedirsizes':
            recursive = command_dict.get('recursive', 'false')
            if recursive == 'true':
                cmd = base_cmd + ['--dircalcsonly', '-q', '-F']
            else:
                path = command_dict['path']
                cmd = base_cmd + [
                    '-d', path, '--dircalcsonly', '--maxdcdepth', '0', '-q',
                    '-F'
                ]

        elif action == 'kill':
            taskid = command_dict['taskid']
            logger.info("[thread-%s]: Kill task message received! (taskid:%s)",
                        threadnum, taskid)
            # do something here to kill task (future)
            clientsock.send(b'{"msg": "taskkilled"}\n')
            return

        else:
            logger.warning("Unknown action")
            clientsock.send(b'{"error": "unknown action"}\n')
            return

        # optional boolean switches: adaptive batch, optimize index,
        # auto tags, index empty dirs (order matches the original cli order)
        optional_flags = (
            (adaptivebatch, '-a'),
            (optimizeindex, '-O'),
            (autotag, '-A'),
            (indexemptydirs, '-e'),
        )
        for value, flag in optional_flags:
            if value in ("True", "true"):
                cmd.append(flag)

        # run command using subprocess
        starttime = time.time()
        taskid = str(uuid.uuid4())
        taskid_bytes = taskid.encode('utf-8')

        # start process
        process = subprocess.Popen(cmd,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)

        # add process to socket_tasks dict (keyed by the bytes task id)
        socket_tasks[taskid_bytes] = process

        clientsock.send(b'{"msg": "taskstart", "taskid": "' + taskid_bytes +
                        b'"}\n')

        logger.info("[thread-%s]: Running command (taskid:%s)", threadnum,
                    taskid)
        logger.info(cmd)

        output, error = process.communicate()

        # send exit msg to client
        exitcode = str(process.returncode)
        # errors='replace': non-UTF-8 bytes in the output would otherwise
        # raise UnicodeDecodeError (a ValueError), which the handler below
        # would misreport as "value error" and suppress the taskfinish msg
        logger.debug('Command output:')
        logger.debug(output.decode('utf-8', errors='replace'))
        logger.debug('Command error:')
        logger.debug(error.decode('utf-8', errors='replace'))
        elapsedtime = str(get_time(time.time() - starttime))
        logger.info(
            "Finished command (taskid:%s), exit code: %s, elapsed time: %s",
            taskid, exitcode, elapsedtime)
        message = b'{"msg": "taskfinish", "taskid": "%s", "exitcode": %s, "elapsedtime": "%s"}\n' \
                  % (taskid_bytes, exitcode.encode('utf-8'),
                     elapsedtime.encode('utf-8'))
        clientsock.send(message)

    except ValueError:
        logger.warning("Value error")
        clientsock.send(b'{"error": "value error"}\n')

    except socket.error as e:
        logger.error("[thread-%s]: Socket error (%s)", threadnum, e)
Ejemplo n.º 4
0
def run_command(threadnum, command_dict, clientsock, cliargs, logger, verbose):
    """Run a command received on the listener socket.

    Builds a diskover command line from ``command_dict`` (falling back to
    ``diskover.config['index']`` for the index name and to
    ``cliargs['batchsize']`` for the batch size), runs it as a subprocess,
    and reports progress to the client as newline-terminated JSON messages:
    ``taskstart`` when the process is launched and ``taskfinish`` (with exit
    code and elapsed time) when it ends. The process is stored in the
    module-level ``socket_tasks`` dict keyed by the task id (as bytes).

    Supported ``command_dict['action']`` values: ``crawl``, ``finddupes``,
    ``reindex`` and ``kill``. ``kill`` only logs and acknowledges with
    ``taskkilled``; any other value is answered with an ``unknown action``
    error.

    Args:
        threadnum: Thread number, used only in log output.
        command_dict: Mapping with the command and its options.
        clientsock: Connected client socket the replies are sent on.
        cliargs: Parsed command line arguments used as option defaults.
        logger: Logger used for status output.
        verbose: Unused here; kept for interface compatibility.

    Raises:
        KeyError: If a required key (for example ``action`` or ``path``) is
            missing from ``command_dict`` or a default is missing.
            ``ValueError`` and ``socket.error`` are handled inside the
            function.
    """
    global socket_tasks

    # try to get index name from command or use from diskover config file
    try:
        index = command_dict['index']
    except KeyError:
        index = diskover.config['index']
    # try to get worker batch size from command or use default
    try:
        batchsize = str(command_dict['batchsize'])
    except KeyError:
        batchsize = str(cliargs['batchsize'])

    try:
        action = command_dict['action']
        pythonpath = diskover.config['python_path']
        diskoverpath = diskover.config['diskover_path']
        # arguments shared by every action's command line
        base_cmd = [
            pythonpath, '-u', diskoverpath, '-b', batchsize, '-i', index
        ]

        # set up command for different action
        if action == 'crawl':
            path = command_dict['path']
            cmd = base_cmd + ['-d', path, '-q']

        elif action == 'finddupes':
            cmd = base_cmd + ['-D', '-q']

        elif action == 'reindex':
            recursive = command_dict.get('recursive', 'false')
            path = command_dict['path']
            reindex_flag = '-R' if recursive == 'true' else '-r'
            cmd = base_cmd + ['-d', path, reindex_flag, '-q']

        elif action == 'kill':
            taskid = command_dict['taskid']
            logger.info("[thread-%s]: Kill task message received! (taskid:%s)",
                        threadnum, taskid)
            # do something here to kill task (future)
            clientsock.send(b'{"msg": "taskkilled"}\n')
            return

        else:
            logger.warning("Unknown action")
            clientsock.send(b'{"error": "unknown action"}\n')
            return

        # run command using subprocess
        starttime = time.time()
        taskid = str(uuid.uuid4())
        taskid_bytes = taskid.encode('utf-8')

        # start process
        process = subprocess.Popen(cmd,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)

        # add process to socket_tasks dict (keyed by the bytes task id)
        socket_tasks[taskid_bytes] = process

        clientsock.send(b'{"msg": "taskstart", "taskid": "' + taskid_bytes +
                        b'"}\n')

        logger.info("[thread-%s]: Running command (taskid:%s)", threadnum,
                    taskid)
        logger.info(cmd)

        output, error = process.communicate()

        # send exit msg to client
        exitcode = str(process.returncode)
        # errors='replace': non-UTF-8 bytes in the output would otherwise
        # raise UnicodeDecodeError (a ValueError), which the handler below
        # would misreport as "value error" and suppress the taskfinish msg
        logger.debug('Command output:')
        logger.debug(output.decode('utf-8', errors='replace'))
        logger.debug('Command error:')
        logger.debug(error.decode('utf-8', errors='replace'))
        elapsedtime = str(diskover.get_time(time.time() - starttime))
        logger.info(
            "Finished command (taskid:%s), exit code: %s, elapsed time: %s",
            taskid, exitcode, elapsedtime)
        message = b'{"msg": "taskfinish", "taskid": "%s", "exitcode": %s, "elapsedtime": "%s"}\n' \
                  % (taskid_bytes, exitcode.encode('utf-8'),
                     elapsedtime.encode('utf-8'))
        clientsock.send(message)

    except ValueError:
        logger.warning("Value error")
        clientsock.send(b'{"error": "value error"}\n')

    except socket.error as e:
        logger.error("[thread-%s]: Socket error (%s)", threadnum, e)