Ejemplo n.º 1
0
class Scheduler(object):
    """Tasks Scheduler.

    This class is responsible for the main execution loop of the tool. It
    prepares the analysis machines and keeps waiting for and loading new
    analysis tasks.
    Whenever a new task is available, it launches an AnalysisManager which
    will take care of running the full analysis process and operating with
    the assigned analysis machine.

    The machine manager and the startup lock live in the module-level
    globals ``machinery`` and ``machine_lock``; both are assigned by
    initialize().
    """

    def __init__(self, maxcount=None):
        """Set up the scheduler state.

        maxcount is the maximum number of analyses to launch. When it is
        None, start() falls back to the max_analysis_count option of the
        configuration file.
        """
        self.running = True
        self.cfg = Config()
        self.db = Database()
        self.maxcount = maxcount
        self.total_analysis_count = 0
        # Analysis managers that have been started and not yet cleaned up.
        self.analysis_managers = set()

    def initialize(self):
        """Initialize the machine manager.

        Assigns the ``machinery`` and ``machine_lock`` globals, verifies that
        at least one machine is available and drops forwarding rules that may
        have been left behind by a previous run.

        Raises CuckooCriticalError when the machinery cannot be initialized
        or when it reports no machines.
        """
        global machinery, machine_lock

        machinery_name = self.cfg.cuckoo.machinery

        # With a configured startup limit a semaphore of that size is used,
        # otherwise a plain lock.
        max_vmstartup_count = self.cfg.cuckoo.max_vmstartup_count
        if max_vmstartup_count:
            machine_lock = threading.Semaphore(max_vmstartup_count)
        else:
            machine_lock = threading.Lock()

        log.info("Using \"%s\" as machine manager",
                 machinery_name,
                 extra={
                     "action": "init.machinery",
                     "status": "success",
                     "machinery": machinery_name,
                 })

        # Instantiate the machine manager plugin.
        machinery = cuckoo.machinery.plugins[machinery_name]()

        # Provide a dictionary with the configuration options to the
        # machine manager instance.
        machinery.set_options(Config(machinery_name))

        # Initialize the machine manager.
        try:
            machinery.initialize(machinery_name)
        except CuckooMachineError as e:
            raise CuckooCriticalError("Error initializing machines: %s" % e)

        # At this point all the available machines should have been identified
        # and added to the list. If none were found, Cuckoo aborts the
        # execution. TODO In the future we'll probably want get rid of this.
        if not machinery.machines():
            raise CuckooCriticalError("No machines available.")

        log.info("Loaded %s machine/s",
                 len(machinery.machines()),
                 extra={
                     "action": "init.machines",
                     "status": "success",
                     "count": len(machinery.machines()),
                 })

        if len(machinery.machines()) > 1 and self.db.engine.name == "sqlite":
            log.warning("As you've configured Cuckoo to execute parallel "
                        "analyses, we recommend you to switch to a MySQL or "
                        "a PostgreSQL database as SQLite might cause some "
                        "issues.")

        if len(machinery.machines()) > 4 and self.cfg.cuckoo.process_results:
            log.warning("When running many virtual machines it is recommended "
                        "to process the results in separate 'cuckoo process' "
                        "instances to increase throughput and stability. "
                        "Please read the documentation about the "
                        "`Processing Utility`.")

        # Drop all existing packet forwarding rules for each VM. Just in case
        # Cuckoo was terminated for some reason and various forwarding rules
        # have thus not been dropped yet.
        for machine in machinery.machines():
            if not machine.interface:
                log.info(
                    "Unable to determine the network interface for VM "
                    "with name %s, Cuckoo will not be able to give it "
                    "full internet access or route it through a VPN! "
                    "Please define a default network interface for the "
                    "machinery or define a network interface for each "
                    "VM.", machine.name)
                continue

            # Drop forwarding rule to each VPN.
            if config("routing:vpn:enabled"):
                for vpn in config("routing:vpn:vpns"):
                    rooter("forward_disable", machine.interface,
                           config("routing:%s:interface" % vpn), machine.ip)

            # Drop forwarding rule to the internet / dirty line.
            if config("routing:routing:internet") != "none":
                rooter("forward_disable", machine.interface,
                       config("routing:routing:internet"), machine.ip)

    def stop(self):
        """Stop scheduler.

        Clears the running flag, force-stops every tracked analysis manager,
        shuts down the machine manager and finally runs cleanup() on every
        tracked analysis manager. Errors raised by individual managers are
        logged and do not abort the shutdown.
        """
        self.running = False

        # Force stop all analysis managers.
        for am in self.analysis_managers:
            try:
                am.force_stop()
            except Exception as e:
                log.exception("Error force stopping analysis manager: %s", e)

        # Shutdown machine manager (used to kill machines that are still
        # alive).
        machinery.shutdown()

        # Remove network rules if any are present and stop auxiliary modules
        for am in self.analysis_managers:
            try:
                am.cleanup()
            except Exception as e:
                log.exception("Error while cleaning up analysis manager: %s",
                              e)

    def _cleanup_managers(self):
        """Run cleanup() on every tracked manager that is no longer alive.

        Returns the set of managers that were handled (also when their
        cleanup raised, in which case the error is logged). The tracked set
        itself is not modified here; the caller is expected to untrack the
        returned managers.
        """
        cleaned = set()
        for am in self.analysis_managers:
            if not am.isAlive():
                try:
                    am.cleanup()
                except Exception as e:
                    log.exception("Error in analysis manager cleanup: %s", e)

                cleaned.add(am)
        return cleaned

    def start(self):
        """Start scheduler.

        Initializes the machinery and then loops while self.running is set:
        finished analysis managers are cleaned up, a series of readiness
        checks is performed (machine lock, free disk space, running machine
        limit, available machines, analysis count limit) and, when a pending
        task is found, an AnalysisManager is started for it.

        Exceptions placed on the shared error queue are re-raised from this
        method.
        """
        self.initialize()

        log.info("Waiting for analysis tasks.")

        # Message queue with threads to transmit exceptions (used as IPC).
        errors = Queue.Queue()

        # Command-line overrides the configuration file.
        if self.maxcount is None:
            self.maxcount = self.cfg.cuckoo.max_analysis_count

        # Starts out True so that the very first iteration does not sleep.
        launchedAnalysis = True

        # This loop runs until the running flag is cleared (see stop()).
        while self.running:
            # Only pause when the previous round did not launch an analysis.
            if not launchedAnalysis:
                time.sleep(1)

            launchedAnalysis = False

            # Run cleanup on finished analysis managers and untrack them
            for am in self._cleanup_managers():
                self.analysis_managers.discard(am)

            # Wait until the machine lock is not locked. This is only the case
            # when all machines are fully running, rather than about to start
            # or still busy starting. This way we won't have race conditions
            # with finding out there are no available machines in the analysis
            # manager or having two analyses pick the same machine.
            if not machine_lock.acquire(False):
                logger("Could not acquire machine lock",
                       action="scheduler.machine_lock",
                       status="busy")
                continue

            machine_lock.release()

            # If not enough free disk space is available, then we print an
            # error message and wait another round (this check is ignored
            # when the freespace configuration variable is set to zero).
            if self.cfg.cuckoo.freespace:
                # Resolve the full base path to the analysis folder, just in
                # case somebody decides to make a symbolic link out of it.
                dir_path = cwd("storage", "analyses")

                # TODO: Windows support
                if hasattr(os, "statvfs"):
                    dir_stats = os.statvfs(dir_path.encode("utf8"))

                    # Calculate the free disk space in megabytes.
                    space_available = dir_stats.f_bavail * dir_stats.f_frsize
                    space_available /= 1024 * 1024

                    if space_available < self.cfg.cuckoo.freespace:
                        log.error("Not enough free disk space! (Only %d MB!)",
                                  space_available,
                                  extra={
                                      "action": "scheduler.diskspace",
                                      "status": "error",
                                      "available": space_available,
                                  })
                        continue

            # If we have limited the number of concurrently executing machines,
            # are we currently at the maximum?
            maxvm = self.cfg.cuckoo.max_machines_count
            if maxvm and len(machinery.running()) >= maxvm:
                logger("Already maxed out on running machines",
                       action="scheduler.machines",
                       status="maxed")
                continue

            # If no machines are available, it's pointless to fetch for
            # pending tasks. Loop over.
            if not machinery.availables():
                logger("No available machines",
                       action="scheduler.machines",
                       status="none")
                continue

            # Exits if max_analysis_count is defined in the configuration
            # file and has been reached.
            # NOTE(review): active_analysis_count is not defined in this
            # class; presumably a module-level counter maintained elsewhere
            # -- confirm.
            if self.maxcount and self.total_analysis_count >= self.maxcount:
                if active_analysis_count <= 0:
                    log.debug("Reached max analysis count, exiting.",
                              extra={
                                  "action": "scheduler.max_analysis",
                                  "status": "success",
                                  "limit": self.total_analysis_count,
                              })
                    self.stop()
                else:
                    logger(
                        "Maximum analyses hit, awaiting active to finish off",
                        action="scheduler.max_analysis",
                        status="busy",
                        active=active_analysis_count)
                continue

            # Fetch a pending analysis task.
            # TODO This fixes only submissions by --machine, need to add
            # other attributes (tags etc).
            # TODO We should probably move the entire "acquire machine" logic
            # from the Analysis Manager to the Scheduler and then pass the
            # selected machine onto the Analysis Manager instance.
            task, available = None, False
            for machine in self.db.get_available_machines():
                task = self.db.fetch(machine=machine.name)
                if task:
                    break

                if machine.is_analysis():
                    available = True

            # We only fetch a new task if at least one of the available
            # machines is not a "service" machine (again, please refer to the
            # services auxiliary module for more information on service VMs).
            if not task and available:
                task = self.db.fetch(service=False)

            if task:
                # Measure elapsed wall-clock time. time.clock() reports CPU
                # time on POSIX systems and no longer exists on
                # Python 3.8+, so time.time() is used instead.
                started_at = time.time()
                self.total_analysis_count += 1

                # Initialize and start the analysis manager.
                analysis = AnalysisManager(task.id, errors)
                analysis.daemon = True
                analysis.start()
                self.analysis_managers.add(analysis)
                launchedAnalysis = True
                # Sub-second precision: launching a manager normally takes
                # far less than a second, which "%d" would always show as 0.
                log.debug("Processing task #%s Call Duration %.3fs", task.id,
                          time.time() - started_at)

            # Deal with errors: re-raise the first queued exception, if any.
            try:
                raise errors.get(block=False)
            except Queue.Empty:
                pass

        log.debug("End of analyses.")
Ejemplo n.º 2
0
class Scheduler(object):
    """Task scheduler running the tool's main execution loop.

    The scheduler sets up the analysis machines and then keeps polling for
    pending analysis tasks. For every task it picks up it starts an
    AnalysisManager, which runs the complete analysis and drives the
    machine assigned to it.

    The machine manager and the startup lock are kept in the module-level
    globals ``machinery`` and ``machine_lock`` assigned by initialize().
    """

    def __init__(self, maxcount=None):
        """Store the analysis limit and create the config/database handles."""
        self.total_analysis_count = 0
        self.maxcount = maxcount
        self.running = True
        self.cfg = Config()
        self.db = Database()

    def initialize(self):
        """Prepare the machine manager and clear stale forwarding rules.

        Raises CuckooCriticalError when the machinery fails to initialize
        or reports no machines.
        """
        global machinery, machine_lock

        plugin_name = self.cfg.cuckoo.machinery

        # A semaphore of the configured size when a startup limit is set,
        # a plain lock otherwise.
        startup_limit = self.cfg.cuckoo.max_vmstartup_count
        machine_lock = (
            threading.Semaphore(startup_limit) if startup_limit
            else threading.Lock()
        )

        plugin_extra = {
            "action": "init.machinery",
            "status": "success",
            "machinery": plugin_name,
        }
        log.info('Using "%s" as machine manager', plugin_name,
                 extra=plugin_extra)

        # Instantiate the machinery plugin and hand it its own
        # configuration options.
        machinery = cuckoo.machinery.plugins[plugin_name]()
        machinery.set_options(Config(plugin_name))

        try:
            machinery.initialize(plugin_name)
        except CuckooMachineError as e:
            raise CuckooCriticalError("Error initializing machines: %s" % e)

        # Every available machine should be known by now; without any
        # machine there is nothing to schedule on, so abort.
        # TODO In the future we'll probably want get rid of this.
        if not machinery.machines():
            raise CuckooCriticalError("No machines available.")

        loaded = len(machinery.machines())
        log.info("Loaded %s machine/s", loaded, extra={
            "action": "init.machines",
            "status": "success",
            "count": len(machinery.machines()),
        })

        if len(machinery.machines()) > 1:
            if self.db.engine.name == "sqlite":
                log.warning(
                    "As you've configured Cuckoo to execute parallel "
                    "analyses, we recommend you to switch to a MySQL or "
                    "a PostgreSQL database as SQLite might cause some "
                    "issues."
                )

        if len(machinery.machines()) > 4:
            if self.cfg.cuckoo.process_results:
                log.warning(
                    "When running many virtual machines it is recommended "
                    "to process the results in separate 'cuckoo process' "
                    "instances to increase throughput and stability. "
                    "Please read the documentation about the "
                    "`Processing Utility`."
                )

        # Forwarding rules may have survived an earlier unclean exit, so
        # drop them for every VM whose interface is known.
        for vm in machinery.machines():
            if vm.interface:
                self._drop_forwarding_rules(vm)
            else:
                log.info("Unable to determine the network interface for VM "
                         "with name %s, Cuckoo will not be able to give it "
                         "full internet access or route it through a VPN! "
                         "Please define a default network interface for the "
                         "machinery or define a network interface for each "
                         "VM.", vm.name)

    def _drop_forwarding_rules(self, machine):
        """Disable VPN and internet forwarding for a single machine."""
        # One rule per configured VPN.
        if config("routing:vpn:enabled"):
            for vpn_name in config("routing:vpn:vpns"):
                vpn_interface = config("routing:%s:interface" % vpn_name)
                rooter("forward_disable", machine.interface, vpn_interface,
                       machine.ip)

        # The internet / dirty line rule.
        if config("routing:routing:internet") != "none":
            internet_interface = config("routing:routing:internet")
            rooter("forward_disable", machine.interface, internet_interface,
                   machine.ip)

    def stop(self):
        """Clear the running flag and shut down the machine manager."""
        self.running = False
        # Shutting down the machinery kills machines that are still alive.
        machinery.shutdown()

    def _next_task(self):
        """Return the next pending task, or None when nothing is runnable.

        Tasks pinned to one of the available machines are preferred. Only
        when none exists and at least one available machine is an analysis
        (non-service) machine is a generic non-service task fetched.
        """
        # TODO This fixes only submissions by --machine, need to add
        # other attributes (tags etc).
        # TODO We should probably move the entire "acquire machine" logic
        # from the Analysis Manager to the Scheduler and then pass the
        # selected machine onto the Analysis Manager instance.
        analysis_vm_free = False
        for candidate in self.db.get_available_machines():
            pinned = self.db.fetch(machine=candidate.name)
            if pinned:
                return pinned

            if candidate.is_analysis():
                analysis_vm_free = True

        if analysis_vm_free:
            return self.db.fetch(service=False)
        return None

    def start(self):
        """Run the scheduling loop until the running flag is cleared.

        Exceptions found on the shared error queue are re-raised from here.
        """
        self.initialize()

        log.info("Waiting for analysis tasks.")

        # Queue on which exceptions are handed over to this loop.
        error_queue = Queue.Queue()

        # A limit given on the command line wins over the configuration.
        if self.maxcount is None:
            self.maxcount = self.cfg.cuckoo.max_analysis_count

        while self.running:
            time.sleep(1)

            # Only proceed when the machine lock is free, i.e. no machine is
            # about to start or busy starting. This avoids races such as two
            # analyses picking the same machine.
            if machine_lock.acquire(False):
                machine_lock.release()
            else:
                logger(
                    "Could not acquire machine lock",
                    action="scheduler.machine_lock", status="busy"
                )
                continue

            # Skip this round when free disk space is below the configured
            # minimum (a freespace value of zero disables the check).
            if self.cfg.cuckoo.freespace:
                analyses_dir = cwd("storage", "analyses")

                # TODO: Windows support
                if hasattr(os, "statvfs"):
                    stats = os.statvfs(analyses_dir.encode("utf8"))

                    # Free space expressed in megabytes.
                    free_mb = stats.f_bavail * stats.f_frsize
                    free_mb /= 1024 * 1024

                    if free_mb < self.cfg.cuckoo.freespace:
                        disk_extra = {
                            "action": "scheduler.diskspace",
                            "status": "error",
                            "available": free_mb,
                        }
                        log.error(
                            "Not enough free disk space! (Only %d MB!)",
                            free_mb, extra=disk_extra
                        )
                        continue

            # Respect the optional cap on concurrently running machines.
            machine_limit = self.cfg.cuckoo.max_machines_count
            if machine_limit and len(machinery.running()) >= machine_limit:
                logger(
                    "Already maxed out on running machines",
                    action="scheduler.machines", status="maxed"
                )
                continue

            # Without an available machine there is no point in fetching
            # pending tasks.
            if not machinery.availables():
                logger(
                    "No available machines",
                    action="scheduler.machines", status="none"
                )
                continue

            # Once the analysis limit is reached, wait for the active
            # analyses to finish and then stop.
            if self.maxcount and self.total_analysis_count >= self.maxcount:
                if active_analysis_count > 0:
                    logger(
                        "Maximum analyses hit, awaiting active to finish off",
                        action="scheduler.max_analysis", status="busy",
                        active=active_analysis_count
                    )
                else:
                    log.debug("Reached max analysis count, exiting.", extra={
                        "action": "scheduler.max_analysis",
                        "status": "success",
                        "limit": self.total_analysis_count,
                    })
                    self.stop()
                continue

            task = self._next_task()
            if task:
                log.debug("Processing task #%s", task.id)
                self.total_analysis_count += 1

                # Hand the task to a freshly started analysis manager.
                manager = AnalysisManager(task.id, error_queue)
                manager.daemon = True
                manager.start()

            # Re-raise the first queued exception, if there is one.
            try:
                raise error_queue.get(block=False)
            except Queue.Empty:
                pass

        log.debug("End of analyses.")
Ejemplo n.º 3
0
class DatabaseEngine(object):
    """Tests database stuff."""
    URI = None

    def setup_class(self):
        set_cwd(tempfile.mkdtemp())

    def setup(self):
        self.d = Database()
        self.d.connect(dsn=self.URI)

    def teardown(self):
        # Clear all tables without dropping them
        # This is done after each test to ensure a test doesn't fail because
        # of data of a previous test
        meta = MetaData()
        meta.reflect(self.d.engine)
        ses = self.d.Session()
        try:
            for t in reversed(meta.sorted_tables):
                ses.execute(t.delete())
            ses.commit()
        finally:
            ses.close()

    def test_add_target(self):
        count = self.d.Session().query(Target).count()
        add_target("http://example.com", category="url")
        assert self.d.Session().query(Target).count() == count + 1

    def test_add_task(self):
        fd, sample_path = tempfile.mkstemp()
        os.write(fd, "hehe")
        os.close(fd)

        # Add task.
        count = self.d.Session().query(Task).count()
        add_task(sample_path, category="file")
        assert self.d.Session().query(Task).count() == count + 1

    def test_processing_get_task(self):
        # First reset all existing rows so that earlier exceptions don't affect
        # this unit test run.
        null, session = None, self.d.Session()

        session.query(Task).filter(Task.status == "completed",
                                   Task.processing == null).update({
                                       "processing":
                                       "something",
                                   })
        session.commit()

        t1 = add_task("http://google.com/1",
                      priority=1,
                      status="completed",
                      category="url")
        t2 = add_task("http://google.com/2",
                      priority=2,
                      status="completed",
                      category="url")
        t3 = add_task("http://google.com/3",
                      priority=1,
                      status="completed",
                      category="url")
        t4 = add_task("http://google.com/4",
                      priority=1,
                      status="completed",
                      category="url")
        t5 = add_task("http://google.com/5",
                      priority=3,
                      status="completed",
                      category="url")
        t6 = add_task("http://google.com/6",
                      priority=1,
                      status="completed",
                      category="url")
        t7 = add_task("http://google.com/7",
                      priority=1,
                      status="completed",
                      category="url")

        assert self.d.processing_get_task("foo") == t5
        assert self.d.processing_get_task("foo") == t2
        assert self.d.processing_get_task("foo") == t1
        assert self.d.processing_get_task("foo") == t3
        assert self.d.processing_get_task("foo") == t4
        assert self.d.processing_get_task("foo") == t6
        assert self.d.processing_get_task("foo") == t7
        assert self.d.processing_get_task("foo") is None

    def test_error_exists(self):
        task_id = add_task("http://google.com/7", category="url")
        self.d.add_error("A" * 1024, task_id)
        assert len(self.d.view_errors(task_id)) == 1
        self.d.add_error("A" * 1024, task_id)
        assert len(self.d.view_errors(task_id)) == 2

    def test_long_error(self):
        add_task("http://google.com/", category="url")
        self.d.add_error("A" * 1024, 1)
        err = self.d.view_errors(1)
        assert err and len(err[0].message) == 1024

    def test_submit(self):
        dirpath = tempfile.mkdtemp()
        submit_id = self.d.add_submit(dirpath, "files", {
            "foo": "bar",
        })
        submit = self.d.view_submit(submit_id)
        assert submit.id == submit_id
        assert submit.tmp_path == dirpath
        assert submit.submit_type == "files"
        assert submit.data == {
            "foo": "bar",
        }

    def test_connect_no_create(self):
        AlembicVersion.__table__.drop(self.d.engine)
        self.d.connect(dsn=self.URI, create=False)
        assert "alembic_version" not in self.d.engine.table_names()
        self.d.connect(dsn=self.URI)
        assert "alembic_version" in self.d.engine.table_names()

    def test_view_submit_tasks(self):
        submit_id = self.d.add_submit(None, None, None)
        target_id = add_target(__file__, category="file")
        t1 = add_task(custom="1", submit_id=submit_id)
        t2 = add_task(custom="2", submit_id=submit_id)

        submit = self.d.view_submit(submit_id)
        assert submit.id == submit_id
        with pytest.raises(DetachedInstanceError):
            print submit.tasks

        submit = self.d.view_submit(submit_id, tasks=True)
        assert len(submit.tasks) == 2
        tasks = sorted((task.id, task) for task in submit.tasks)
        assert tasks[0][1].id == t1
        assert tasks[0][1].custom == "1"
        assert tasks[1][1].id == t2
        assert tasks[1][1].custom == "2"

    def test_task_set_options(self):
        t0 = add_task(__file__, options={"foo": "bar"})
        t1 = add_task(__file__, options="foo=bar")

        assert self.d.view_task(t0).options == {"foo": "bar"}
        assert self.d.view_task(t1).options == {"foo": "bar"}

    def test_error_action(self):
        task_id = add_task(__file__)
        self.d.add_error("message1", task_id)
        self.d.add_error("message2", task_id, "actionhere")
        e1, e2 = self.d.view_errors(task_id)
        assert e1.message == "message1"
        assert e1.action is None
        assert e2.message == "message2"
        assert e2.action == "actionhere"

    def test_view_tasks(self):
        t1 = add_task(__file__)
        t2 = add_task("http://example.com", category="url")
        tasks = self.d.view_tasks([t1, t2])
        assert tasks[0].to_dict() == self.d.view_task(t1).to_dict()
        assert tasks[1].to_dict() == self.d.view_task(t2).to_dict()

    def test_add_machine(self):
        self.d.add_machine("name1", "label", "1.2.3.4", "windows", None,
                           "tag1 tag2", "int0", "snap0", "5.6.7.8", 2043,
                           "virtualbox")
        self.d.add_machine("name2", "label", "1.2.3.4", "windows", "",
                           "tag1 tag2", "int0", "snap0", "5.6.7.8", 2043,
                           "virtualbox")
        self.d.add_machine("name3", "label", "1.2.3.4", "windows", "opt1 opt2",
                           "tag1 tag2", "int0", "snap0", "5.6.7.8", 2043,
                           "virtualbox")
        self.d.add_machine("name4",
                           "label",
                           "1.2.3.4",
                           "windows", ["opt3", "opt4"],
                           "tag1 tag2",
                           "int0",
                           "snap0",
                           "5.6.7.8",
                           2043,
                           "virtualbox",
                           reserved_by=1600)
        m1 = self.d.view_machine("name1")
        m2 = self.d.view_machine("name2")
        m3 = self.d.view_machine("name3")
        m4 = self.d.view_machine("name4")
        assert m1.options == []
        assert m2.options == []
        assert m3.options == ["opt1", "opt2"]
        assert m4.options == ["opt3", "opt4"]
        assert m1.manager == "virtualbox"
        assert m4.reserved_by == 1600

    def test_adding_task(self):
        now = datetime.datetime.now()
        id = add_task(__file__, "file", 0, "py", "free=yes", 3, "custom",
                      "owner", "machine1", "DogeOS", ["tag1"], False, False,
                      now, "regular", None, now)

        task = self.d.view_task(id)
        assert id is not None
        assert task.timeout == 0
        assert task.package == "py"
        assert task.options == {"free": "yes"}
        assert task.priority == 3
        assert task.custom == "custom"
        assert task.owner == "owner"
        assert task.machine == "machine1"
        assert task.platform == "DogeOS"
        assert len(task.tags) == 1
        assert task.tags[0].name == "tag1"
        assert task.memory == False
        assert task.enforce_timeout == False
        assert task.clock == now
        assert task.submit_id is None
        assert task.start_on == now
        assert len(task.targets) == 1
        assert task.targets[0].category == "file"
        assert task.targets[0].target == __file__

    def test_set_machine_rcparams(self):
        self.d.add_machine("name5", "label5", "1.2.3.4", "windows", None,
                           "tag1 tag2", "int0", "snap0", "5.6.7.8", 2043,
                           "virtualbox")

        self.d.set_machine_rcparams("label5", {
            "protocol": "rdp",
            "host": "127.0.0.1",
            "port": 3389,
        })

        m = self.d.view_machine("name5")
        assert m.rcparams == {
            "protocol": "rdp",
            "host": "127.0.0.1",
            "port": "3389",
        }

    def test_add_target_file(self):
        fd, sample_path = tempfile.mkstemp()
        os.write(fd, os.urandom(64))
        os.close(fd)
        target = File(sample_path)

        id = add_target(sample_path, "file")
        db_target = self.d.find_target(id=id)

        assert id is not None
        assert db_target.file_size == 64
        assert db_target.file_type == target.get_type()
        assert db_target.md5 == target.get_md5()
        assert db_target.crc32 == target.get_crc32()
        assert db_target.sha1 == target.get_sha1()
        assert db_target.sha256 == target.get_sha256()
        assert db_target.sha512 == target.get_sha512()
        assert db_target.ssdeep == target.get_ssdeep()
        assert db_target.category == "file"

    def test_add_target_url(self):
        """A URL target is stored with the hashes URL computes for it and
        with the "url" category."""
        url = URL("http://example.com/")

        target_id = add_target(url.url, "url")
        stored = self.d.find_target(id=target_id)

        assert target_id is not None
        # Each stored hash column must equal the matching URL.get_<name>().
        for name in ("md5", "crc32", "sha1", "sha256", "sha512", "ssdeep"):
            assert getattr(stored, name) == getattr(url, "get_%s" % name)()
        assert stored.category == "url"

    def test_find_target(self):
        """A stored target can be looked up by id or by any single hash."""
        fd, sample_path = tempfile.mkstemp()
        try:
            os.write(fd, os.urandom(64))
            os.close(fd)
            target = File(sample_path)
            target_id = add_target(sample_path, category="file")

            find = self.d.find_target
            assert find(id=target_id).id == target_id
            assert find(crc32=target.get_crc32()).id == target_id
            assert find(md5=target.get_md5()).id == target_id
            assert find(sha1=target.get_sha1()).id == target_id
            assert find(sha256=target.get_sha256()).id == target_id
            assert find(sha512=target.get_sha512()).id == target_id
        finally:
            # mkstemp() leaves deletion to the caller; do not leak the file.
            os.remove(sample_path)

    def test_find_target_multifilter(self):
        """Combining filters only matches a target satisfying all of them:
        the second sample's hash with the first sample's id yields None."""
        ids = []
        paths = []
        target = None
        try:
            for _ in range(2):
                fd, sample_path = tempfile.mkstemp()
                paths.append(sample_path)
                os.write(fd, os.urandom(64))
                os.close(fd)
                # After the loop `target` refers to the second sample.
                target = File(sample_path)
                ids.append(add_target(sample_path, category="file"))

            db_target = self.d.find_target(sha256=target.get_sha256(),
                                           target=paths[1])
            assert self.d.find_target(id=ids[0], md5=target.get_md5()) is None
            assert db_target.id == ids[1]
        finally:
            # mkstemp() leaves deletion to the caller; remove every sample.
            for path in paths:
                os.remove(path)

    def test_fetch_with_machine(self):
        """fetch(machine=...) returns the pending task that was submitted
        for that machine."""
        add_task(__file__, category="file", tags=["service"])
        machine_task_id = add_task(
            __file__, category="file", machine="machine1"
        )
        add_task(
            __file__, category="file",
            start_on=datetime.datetime(2200, 5, 12, 12, 12)
        )
        add_task(__file__, category="file")

        fetched = self.d.fetch(machine="machine1", service=False)

        assert fetched.id == machine_task_id
        assert fetched.status == "pending"

    def test_fetch_service_false(self):
        """fetch(service=False) returns the untagged task rather than the
        one tagged "service"."""
        add_task(__file__, category="file", tags=["service"])
        plain_task_id = add_task(__file__, category="file")

        fetched = self.d.fetch(service=False)

        assert fetched.id == plain_task_id
        assert fetched.status == "pending"

    def test_fetch_service_true(self):
        """fetch() with default arguments returns the first submitted task,
        which carries the "service" tag."""
        service_task_id = add_task(
            __file__, category="file", tags=["service"]
        )
        add_task(__file__, category="file", machine="machine1")
        for _ in range(2):
            add_task(__file__)

        fetched = self.d.fetch()

        assert fetched.id == service_task_id
        assert fetched.status == "pending"

    def test_fetch_use_start_on_true(self):
        """By default fetch() passes over a high-priority task whose
        start_on lies in the future and returns the other pending task."""
        add_task(
            __file__, category="file", priority=999,
            start_on=datetime.datetime(2200, 5, 12, 12, 12)
        )
        due_task_id = add_task(__file__, category="file")

        fetched = self.d.fetch(service=False)

        assert fetched.id == due_task_id
        assert fetched.status == "pending"

    def test_fetch_use_start_on_false(self):
        """With use_start_on=False the high-priority task is returned even
        though its start_on lies in the future."""
        scheduled_task_id = add_task(
            __file__, category="file", priority=999,
            start_on=datetime.datetime(2200, 5, 12, 12, 12)
        )
        add_task(__file__, category="file")

        fetched = self.d.fetch(use_start_on=False, service=False)

        assert fetched.id == scheduled_task_id
        assert fetched.status == "pending"

    def test_fetch_use_exclude(self):
        """Task ids passed through `exclude` are skipped, leaving the one
        task that was not excluded."""
        task_ids = [
            add_task(__file__, category="file", priority=999)
            for _ in range(4)
        ]

        fetched = self.d.fetch(service=False, exclude=task_ids[:3])

        assert fetched.id == task_ids[3]
        assert fetched.status == "pending"

    def test_fetch_specific_task(self):
        """fetch(task_id=...) returns exactly the requested task."""
        wanted_id = add_task(__file__, category="file", priority=999)
        # A second, equally prioritised task that must not be returned.
        add_task(__file__, category="file", priority=999)

        fetched = self.d.fetch(task_id=wanted_id)

        assert fetched.id == wanted_id
        assert fetched.status == "pending"

    def test_lock_machine(self):
        """Machines can be locked by task tags, by platform or by label; a
        platform/tags combination without a match raises
        CuckooOperationalError."""
        office7_task_id = add_task(
            __file__, category="file", tags=["app1", "office7"]
        )
        office15_task_id = add_task(
            __file__, category="file", tags=["app1", "office15"]
        )

        # Trailing arguments shared by every machine registered below.
        shared = ("int0", "snap0", "5.6.7.8", 2043, "virtualbox")
        machine_specs = (
            ("name1", "name1", "1.2.3.4", "windows", "", "app1,office7"),
            ("name2", "name2", "1.2.3.4", "DogeOS", "opt1 opt2", "office13"),
            ("name3", "name3", "1.2.3.4", "CoffeeOS", ["opt3", "opt4"],
             "cofOS,office7"),
        )
        for spec in machine_specs:
            self.d.add_machine(*(spec + shared))

        office7_task = self.d.view_task(office7_task_id)
        office15_task = self.d.view_task(office15_task_id)

        by_tags = self.d.lock_machine(tags=office7_task.tags)
        assert by_tags.locked
        assert by_tags.name == "name1"

        with pytest.raises(CuckooOperationalError):
            self.d.lock_machine(platform="DogeOS", tags=office15_task.tags)

        by_platform = self.d.lock_machine(platform="DogeOS")
        assert by_platform.name == "name2"

        by_label = self.d.lock_machine(label="name3")
        assert by_label.locked
        assert by_label.name == "name3"

    def test_list_tasks(self):
        """list_tasks() filters by owner and status and returns every task
        when called without filters."""
        owned_id = add_task(
            __file__, category="file", owner="doge",
            options={"route": "vpn511"}
        )
        other_id = add_task(__file__, category="file")
        add_task(__file__, category="file")

        for task_id in (other_id, owned_id):
            self.d.set_status(task_id, "reported")

        owned_reported = self.d.list_tasks(owner="doge", status="reported")
        everything = self.d.list_tasks()
        reported = self.d.list_tasks(status="reported")

        assert owned_reported[0].id == owned_id
        assert len(everything) == 3
        assert len(reported) == 2

    def test_list_tasks_between(self):
        """The "between" operator on id with values (1, 3) yields three of
        the five submitted tasks."""
        for _ in range(5):
            add_task(__file__, category="file")

        matched = self.d.list_tasks(
            filter_by="id", operators="between", values=(1, 3)
        )

        assert len(matched) == 3

    def test_list_tasks_multiple_filter(self):
        """Several filters can be combined: id > 4 and completed_on before
        the last task's completion time, ordered by id and limited to one
        row, yields task 5."""
        latest_completion = None
        for _ in range(10):
            task_id = add_task(__file__, category="file")

            # Give every task a completion time `task_id` days from now.
            latest_completion = (
                datetime.datetime.now() + datetime.timedelta(days=task_id)
            )
            session = self.d.Session()
            db_task = session.query(Task).get(task_id)
            db_task.completed_on = latest_completion
            session.commit()
            session.close()

        matched = self.d.list_tasks(
            filter_by=["id", "completed_on"],
            operators=[">", "<"],
            values=[4, latest_completion],
            order_by="id",
            limit=1
        )

        assert len(matched) == 1
        assert matched[0].id == 5

    def test_list_tasks_offset_limit(self):
        """With ten tasks, offset=5 and limit=10 return the remaining five,
        the last of which has id 10."""
        for _ in range(10):
            add_task(__file__, category="file")

        page = self.d.list_tasks(offset=5, limit=10, order_by="id")

        assert len(page) == 5
        assert page[4].id == 10

    def test_list_tasks_notvalue(self):
        """The "!=" operator excludes tasks with the given status: of ten
        tasks, the five even-numbered ones are set to "running" and the
        other five are returned."""
        for _ in range(10):
            task_id = add_task(__file__, category="file")
            if not task_id % 2:
                self.d.set_status(task_id, "running")

        not_running = self.d.list_tasks(
            filter_by="status", operators="!=", values="running",
            order_by="id"
        )

        assert len(not_running) == 5
        assert not_running[4].id == 9

    def test_list_tasks_noresults(self):
        """A status filter that matches nothing yields an empty list."""
        for _ in range(5):
            add_task(__file__, category="file")

        reported = self.d.list_tasks(status="reported")

        assert reported == []

    def test_get_available_machines(self):
        """A locked machine is left out of get_available_machines(), while
        a machine that is only reserved is still listed."""
        # Trailing arguments shared by every machine registered below.
        shared = ("int0", "snap0", "5.6.7.8", 2043, "virtualbox")
        machine_specs = (
            ("name1", "name1", "1.2.3.4", "windows", "", "app1,office7"),
            ("name2", "name2", "1.2.3.4", "DogeOS", "opt1 opt2", "office13"),
            ("name3", "name3", "1.2.3.4", "CoffeeOS", ["opt3", "opt4"],
             "cofOS,office7"),
        )
        for spec in machine_specs:
            self.d.add_machine(*(spec + shared))

        self.d.machine_reserve(label="name2", task_id=1337)
        self.d.lock_machine(label="name3")

        available = self.d.get_available_machines()
        names = [db_machine.to_dict()["name"] for db_machine in available]

        assert len(available) == 2
        assert "name2" in names
        assert "name1" in names

    def test_unlock_machine(self):
        """unlock_machine() clears the locked flag set by lock_machine()."""
        self.d.add_machine(
            "name1", "name1", "1.2.3.4", "windows", "", "app1,office7",
            "int0", "snap0", "5.6.7.8", 2043, "virtualbox"
        )

        self.d.lock_machine(label="name1")
        assert self.d.view_machine(name="name1").locked

        self.d.unlock_machine(label="name1")
        assert not self.d.view_machine(name="name1").locked

    def test_list_machines(self):
        """list_machines() returns every registered machine."""
        # Trailing arguments shared by both machines registered below.
        shared = ("int0", "snap0", "5.6.7.8", 2043, "virtualbox")
        machine_specs = (
            ("name1", "name1", "1.2.3.4", "windows", "", "app1,office7"),
            ("name2", "name2", "1.2.3.4", "DogeOS", "opt1 opt2", "office13"),
        )
        for spec in machine_specs:
            self.d.add_machine(*(spec + shared))

        machines = self.d.list_machines()
        names = [db_machine.to_dict()["name"] for db_machine in machines]

        assert len(machines) == 2
        assert "name2" in names
        assert "name1" in names

    def test_machine_reserve(self):
        """machine_reserve() records the reserving task id on a machine
        that starts out unreserved."""
        self.d.add_machine(
            "name1", "name1", "1.2.3.4", "windows", "", "app1,office7",
            "int0", "snap0", "5.6.7.8", 2043, "virtualbox"
        )
        before = self.d.view_machine(name="name1")
        assert before.reserved_by is None

        self.d.machine_reserve(label="name1", task_id=42)

        after = self.d.view_machine(name="name1")
        assert after.reserved_by == 42

    def test_clear_reservation(self):
        """clear_reservation() resets reserved_by back to None."""
        self.d.add_machine(
            "name1", "name1", "1.2.3.4", "windows", "", "app1,office7",
            "int0", "snap0", "5.6.7.8", 2043, "virtualbox"
        )

        self.d.machine_reserve(label="name1", task_id=42)
        reserved = self.d.view_machine(name="name1")
        assert reserved.reserved_by == 42

        self.d.clear_reservation(label="name1")
        cleared = self.d.view_machine(name="name1")
        assert cleared.reserved_by is None

    def test_clean_machines(self):
        """clean_machines() removes every registered machine."""
        for index in range(6):
            label = "name%s" % index
            self.d.add_machine(
                label, label, "1.2.3.4", "windows", "", "app1,office7",
                "int0", "snap0", "5.6.7.8", 2043, "virtualbox"
            )

        registered = self.d.list_machines()
        assert len(registered) == 6

        self.d.clean_machines()

        remaining = self.d.list_machines()
        assert len(remaining) == 0

    def test_target_to_dict(self):
        """to_dict() on a stored file target exposes its id, size, type,
        hashes, category and original path."""
        fd, sample_path = tempfile.mkstemp()
        try:
            os.write(fd, os.urandom(64))
            os.close(fd)
            target = File(sample_path)
            target_id = add_target(sample_path, category="file")
            db_target = self.d.find_target(id=target_id)
            db_target = db_target.to_dict()

            assert db_target["id"] == target_id
            assert db_target["file_size"] == 64
            assert db_target["file_type"] == target.get_type()
            assert db_target["md5"] == target.get_md5()
            assert db_target["crc32"] == target.get_crc32()
            assert db_target["sha1"] == target.get_sha1()
            assert db_target["sha256"] == target.get_sha256()
            assert db_target["sha512"] == target.get_sha512()
            assert db_target["ssdeep"] == target.get_ssdeep()
            assert db_target["category"] == "file"
            assert db_target["target"] == sample_path
        finally:
            # mkstemp() leaves deletion to the caller; do not leak the file.
            os.remove(sample_path)

    def test_task_multiple_targets(self):
        """Ten targets added with the same task_id all show up on that
        task's `targets`."""
        sample_paths = []
        task_id = add_task()
        try:
            for _ in range(10):
                fd, sample_path = tempfile.mkstemp()
                sample_paths.append(sample_path)
                os.write(fd, os.urandom(64))
                os.close(fd)
                add_target(sample_path, category="file", task_id=task_id)

            task = self.d.view_task(task_id)
            assert task.id == task_id
            assert len(task.targets) == 10
        finally:
            # mkstemp() leaves deletion to the caller; remove every sample
            # created above, even when an assertion failed.
            for sample_path in sample_paths:
                os.remove(sample_path)
Ejemplo n.º 4
0
class Scheduler(object):

    def __init__(self, maxcount=None):
        """Create the scheduler state.

        `maxcount` is the maximum number of analyses to run; when it is
        None, initialize() replaces it with the configured value. The
        machinery and the machine lock are also only created by
        initialize().
        """
        self.running = True
        self.db = Database()

        # Analysis accounting.
        self.maxcount = maxcount
        self.total_analysis_count = 0

        # Placeholders until initialize() has run.
        self.machinery = None
        self.machine_lock = None

        # Analysis managers that are currently being tracked.
        self.managers = []

    def initialize(self):
        """Prepare the machine manager and the scheduler limits.

        Creates the VM start-up lock, instantiates and initializes the
        configured machinery plugin, checks that it reports at least one
        machine, drops stale forwarding rules and resolves the maximum
        analysis count.

        Raises CuckooCriticalError when the machinery fails to initialize
        or when it reports no machines.
        """
        machinery_name = config("cuckoo:cuckoo:machinery")
        max_vmstartup = config("cuckoo:cuckoo:max_vmstartup_count")

        # Initialize a semaphore or lock to prevent too many VMs from
        # starting at the same time. A semaphore created with a value of 0
        # can never be acquired, which would keep the scheduler from ever
        # starting a task, so an unset or zero limit falls back to a plain
        # lock (one VM start-up at a time).
        if max_vmstartup:
            self.machine_lock = threading.Semaphore(max_vmstartup)
        else:
            self.machine_lock = threading.Lock()

        log.info(
            "Using '%s' as machine manager", machinery_name,
            extra={
                "action": "init.machinery",
                "status": "success",
                "machinery": machinery_name,
            }
        )

        # Create the machine manager
        self.machinery = cuckoo.machinery.plugins[machinery_name]()

        # Provide a dictionary with the configuration options to the
        # machine manager instance.
        self.machinery.set_options(Config(machinery_name))

        try:
            self.machinery.initialize(machinery_name)
        except CuckooMachineError as e:
            raise CuckooCriticalError("Error initializing machines: %s" % e)

        # At this point all the available machines should have been identified
        # and added to the list. If none were found, Cuckoo aborts the
        # execution. TODO In the future we'll probably want get rid of this.
        machines = self.machinery.machines()
        if not machines:
            raise CuckooCriticalError("No machines available.")

        log.info(
            "Loaded %s machine/s", len(machines),
            extra={
                "action": "init.machines",
                "status": "success",
                "count": len(machines)
            }
        )

        # More than one machine means analyses run in parallel.
        if len(machines) > 1 and self.db.engine.name == "sqlite":
            log.warning(
                "As you've configured Cuckoo to execute parallel "
                "analyses, we recommend you to switch to a MySQL or "
                "a PostgreSQL database as SQLite might cause some "
                "issues."
            )

        if len(machines) > 4 and config("cuckoo:cuckoo:process_results"):
            log.warning(
                "When running many virtual machines it is recommended to "
                "process the results in separate 'cuckoo process' instances "
                "increase throughput and stability. Please read the "
                "documentation about the `Processing Utility`."
            )

        self.drop_forwarding_rules()

        # Command-line overrides the configuration file.
        if self.maxcount is None:
            self.maxcount = config("cuckoo:cuckoo:max_analysis_count")

    def drop_forwarding_rules(self):
        """Disable packet forwarding for every VM known to the machinery.

        Meant as a clean-up in case Cuckoo was terminated earlier and
        forwarding rules were left behind. VMs without a network interface
        are reported and skipped.
        """
        for vm in self.machinery.machines():
            if not vm.interface:
                log.info(
                    "Unable to determine the network interface for VM "
                    "with name %s, Cuckoo will not be able to give it "
                    "full internet access or route it through a VPN! "
                    "Please define a default network interface for the "
                    "machinery or define a network interface for each "
                    "VM.",
                    vm.name
                )
                continue

            # One forwarding rule per configured VPN.
            if config("routing:vpn:enabled"):
                for vpn_name in config("routing:vpn:vpns"):
                    vpn_interface = config("routing:%s:interface" % vpn_name)
                    rooter(
                        "forward_disable", vm.interface, vpn_interface, vm.ip
                    )

            # The forwarding rule towards the internet / dirty line.
            if config("routing:routing:internet") != "none":
                rooter(
                    "forward_disable", vm.interface,
                    config("routing:routing:internet"), vm.ip
                )

    def stop(self):
        """Stop the Cuckoo task scheduler."""
        self.running = False
        # Shutdown machine manager (used to kill machines that still alive).
        for manager in self.managers:
            manager.force_cleanup()

        self.machinery.shutdown()

    def ready_for_new_run(self):
        """Tell whether a new pending task may be started right now.

        Returns False when the machine lock cannot be acquired, when the
        free disk space is at or below the configured minimum, when the
        configured maximum number of machines is already running, or when
        the machinery reports no available machines. Returns True otherwise.
        """
        # Probe the machine lock without blocking and hand it straight
        # back. It is only free once all machines are fully running, rather
        # than about to start or still busy starting, which avoids races
        # such as the analysis manager finding no available machines or
        # two analyses picking the same machine.
        lock_free = self.machine_lock.acquire(False)
        if not lock_free:
            logger(
                "Could not acquire machine lock",
                action="scheduler.machine_lock", status="busy"
            )
            return False
        self.machine_lock.release()

        # Minimum free disk space, only checked when configured.
        if config("cuckoo:cuckoo:freespace"):
            available_mb = get_free_disk(cwd("storage", "analyses"))

            if available_mb is None:
                # The check itself failed. Carry on, since it is not
                # supported on every platform and may also fail on win32.
                log.error("Error determining free disk space")
            elif available_mb <= config("cuckoo:cuckoo:freespace"):
                log.error(
                    "Not enough free disk space! (Only %d MB!)",
                    available_mb,
                    extra={
                        "action": "scheduler.diskspace",
                        "status": "error",
                        "available": available_mb,
                    }
                )
                return False

        # Upper bound on simultaneously running machines, if configured.
        machine_limit = config("cuckoo:cuckoo:max_machines_count")
        if machine_limit and len(self.machinery.running()) >= machine_limit:
            log.debug(
                "Maximum amount of machines is running",
                extra={
                    "action": "scheduler.machines",
                    "status": "maxed"
                }
            )
            return False

        if not self.machinery.availables():
            logger(
                "No available machines",
                action="scheduler.machines", status="none"
            )
            return False

        return True

    def task_limit_hit(self):
        """Stops the scheduler is the maximum amount of tasks has been
        reached. This can be configured by max_analysis_count in cuckoo.conf
        or passed as an argument when starting Cuckoo."""
        if self.maxcount and self.total_analysis_count >= self.maxcount:
            if not self.managers:
                log.debug(
                    "Reached max analysis count, exiting.", extra={
                        "action": "scheduler.max_analysis",
                        "status": "success",
                        "limit": self.total_analysis_count,
                    }
                )
                self.stop()
                return True

            log.debug(
                "Maximum analyses hit, awaiting active analyses to finish. "
                "Still active: %s", len(self.managers), extra={
                    "action": "scheduler.max_analysis",
                    "status": "busy",
                    "active": len(self.managers)
                }
            )
            return True
        return False

    def handle_pending(self):
        """Try to start one pending task.

        Takes the machine lock without blocking and returns right away when
        it is not available. It then looks for a task/machine pair: first a
        task tied to one of the available machines (the task the machine is
        reserved by, or a task fetched for that machine name); otherwise,
        when at least one available machine reports is_analysis(), a
        non-service task for which the machinery can acquire a machine.

        When a pair is found, a matching analysis manager is looked up, the
        task is set to TASK_RUNNING, the analysis counter is incremented and
        the manager is initialized, started and appended to self.managers.
        Tasks for which no machine with matching requirements exists, no
        analysis manager exists, or manager initialization fails are set to
        TASK_FAILED_ANALYSIS.

        Every early return after the lock was taken releases it here. On
        the success path the lock is left held.
        NOTE(review): presumably the analysis manager releases it later, as
        it is handed the lock in get_analysis_manager() - confirm.
        """
        # Acquire the machine lock non-blocking. This is because the
        # scheduler also handles requests made by analysis managers. A
        # blocking lock could cause a deadlock.
        if not self.machine_lock.acquire(False):
            return

        # Select a task that is specifically for one of the available
        # machines, possibly a service machine or a reserved machine.
        # `analysis` records whether any available machine reported
        # is_analysis().
        machine, task, analysis = None, None, False
        for available_machine in self.db.get_available_machines():

            # If the machine has been reserved for a specific task, this
            # task should be processed first, as the machine will only be
            # released once it has finished (Example: longterm task).
            if available_machine.reserved_by:
                task = self.db.fetch(task_id=available_machine.reserved_by)
                if task:
                    machine = self.machinery.acquire(
                        machine_id=available_machine.name
                    )
                    break
                # Reserved, but its task could not be fetched: skip this
                # machine entirely.
                continue

            task = self.db.fetch(machine=available_machine.name)
            if task:
                machine = self.machinery.acquire(
                    machine_id=available_machine.name
                )
                break

            if available_machine.is_analysis():
                analysis = True

        # No task for a specific machine and at least one of the available
        # machines is not a service machine. Fetch a task that is not
        # for a service machine.
        if not task and not machine and analysis:

            # Search for a task, but don't lock it until we are sure a machine
            # for this task is available, since it might have tags or require
            # a specific platform. Ignore a task if we know a machine is not
            # available for it.
            exclude = []
            while not machine:
                task = self.db.fetch(service=False, exclude=exclude)

                if task is None:
                    break

                try:
                    machine = self.machinery.acquire(
                        machine_id=task.machine, platform=task.platform,
                        tags=task.tags
                    )
                except CuckooOperationalError:
                    log.error(
                        "Task #%s cannot be started, no machine with matching "
                        "requirements for this task exists. Requirements: %s",
                        task.id, Task.requirements_str(task)
                    )
                    # No machine with the required tags, name etc. exists.
                    # Set the analysis to failed.
                    # TODO Use another status so it might be recovered
                    # on next Cuckoo startup if the machine exists by then
                    self.db.set_status(task.id, TASK_FAILED_ANALYSIS)
                    break

                # acquire() returned nothing without raising: skip this
                # task on the next fetch and try another one.
                if not machine:
                    exclude.append(task.id)

        if not task or not machine:
            self.machine_lock.release()
            # Defensive: hand back a machine acquired without a task.
            # NOTE(review): in the flow above a machine is only acquired
            # after a task was fetched, so this branch looks unreachable -
            # confirm.
            if machine:
                self.machinery.release(label=machine.label)
            return

        log.info(
            "Task #%d: acquired machine %s (label=%s)",
            task.id, machine.name, machine.label, extra={
                "action": "vm.acquire",
                "status": "success",
                "vmname": machine.name,
            }
        )

        # Task and matching machine found. Find the analysis manager
        # which supports the type of this task.
        analysis_manager = self.get_analysis_manager(task, machine)

        if not analysis_manager:
            # If no analysis manager is found for this task type, it
            # cannot be started, therefore we release the machine again
            self.machinery.release(label=machine.label)

            # Release machine lock as the machine will not be starting
            self.machine_lock.release()

            # Set task status to failed as it cannot be analysed if no matching
            # analysis manager for its type exists
            self.db.set_status(task.id, TASK_FAILED_ANALYSIS)
            return

        # Only lock task for running if we are sure we will try to start it
        self.db.set_status(task.id, TASK_RUNNING)

        # Increment the total amount of analyses. Note that this happens
        # before init() below, so a task whose manager fails to initialize
        # still counts.
        self.total_analysis_count += 1

        analysis_manager.daemon = True
        if not analysis_manager.init(self.db):
            self.db.set_status(task.id, TASK_FAILED_ANALYSIS)
            log.error(
                "Failed to initialize analysis manager for task #%s", task.id
            )
            self.machine_lock.release()
            self.machinery.release(label=machine.label)
            return

        # If initialization succeeded, start the analysis manager
        # and store it so we can track it
        analysis_manager.start()
        self.managers.append(analysis_manager)

    def get_analysis_manager(self, db_task, machine):
        """Build an analysis manager for the given task.

        Only the first analysis manager plugin whose `supports` contains
        the task type is tried. Returns the prepared manager, or None when
        no plugin supports the type or when handing the task and its
        targets to the manager raised an exception.
        """
        for plugin in cuckoo.analysis.plugins:
            if db_task.type not in plugin.supports:
                continue

            core_task = Task(db_task)
            candidate = plugin(machine, self.machinery, self.machine_lock)
            try:
                candidate.set_task(core_task)
                candidate.set_target(core_task.targets)
            except Exception:
                log.exception(
                    "Failure when setting task and target for analysis"
                    " manager '%s'.", plugin
                )
                return None

            return candidate

        return None

    def handle_managers(self):
        """Executes actions requested by analysis managers. If an analysis
        manager is finished, executes its finalize actions. Returns a
        list of analysis managers to untrack"""
        remove = []
        for manager in self.managers:

            if manager.action_requested():
                status = manager.get_analysis_status()
                status_action = getattr(manager, "on_status_%s" % status, None)
                if status_action:
                    log.debug(
                        "Executing requested action by task #%s for status"
                        " '%s'", manager.task.id, status
                    )
                    try:
                        status_action(self.db)
                    except Exception as e:
                        log.exception(
                            "Error executing requested action: %s. Error: %s",
                            status_action, e
                        )
                else:
                    log.error(
                        "Analysis manager for task #%s requested action for"
                        " status '%s', but no action is implemented",
                        manager.task.id, status
                    )
                manager.action_lock.release()

            if not manager.isAlive():
                manager.finalize(self.db)
                remove.append(manager)

        return remove

    def keep_running(self):
        return self.running

    def start(self):
        """Initialize the scheduler and run its main loop.

        The loop runs once per second for as long as keep_running() is
        true: it serves the analysis managers, checks the analysis limit
        and, when allowed, starts a pending task.
        """
        self.initialize()

        log.info("Waiting for analysis tasks")

        while self.keep_running():
            time.sleep(1)

            # Serve requests made by the analysis managers and stop
            # tracking the ones that were finalized because they exited.
            finished = self.handle_managers()
            for manager in finished:
                self.managers.remove(manager)

            # task_limit_hit() also stops the scheduler once the limit is
            # reached and no analysis manager is left; nothing new may be
            # started in this iteration when it reports True.
            if not self.task_limit_hit():
                # Only when tasks are pending: check the number of running
                # VMs, the disk space etc., then pick a task, a matching
                # machine and an analysis manager and start the analysis.
                pending = self.db.count_tasks(status=TASK_PENDING)
                if pending and self.ready_for_new_run():
                    self.handle_pending()

        log.debug("End of analyses.")