Example #2
class Database:
    """
    Entrypoint for all Scanner operations.

    Attributes:
        config: The Config object for the database.
        ops: An OpGenerator object for computation creation.
        protobufs: TODO(wcrichto)
    """
    def __init__(self,
                 master=None,
                 workers=None,
                 config_path=None,
                 config=None,
                 debug=False):
        """
        Initializes a Scanner database.

        This will create a database at the `db_path` specified in the config
        if none exists.

        Kwargs:
            config_path: Path to a Scanner configuration TOML, by default
                         assumed to be `~/.scanner.toml`.
            config: A scanner Config object. If specified, config_path is
                    ignored.

        Returns:
            A database instance.
        """
        if config:
            self.config = config
        else:
            self.config = Config(config_path)

        self._debug = debug or (master is None and workers is None)

        self._master = None

        import libscanner as bindings
        self._bindings = bindings

        # Setup database metadata
        self._db_path = self.config.db_path
        self._storage = self.config.storage
        self._cached_db_metadata = None
        self._png_dump_prefix = '__png_dump_{:s}'

        self.ops = OpGenerator(self)
        self.protobufs = ProtobufGenerator(self.config)

        self.start_cluster(master, workers)

        # Initialize database if it does not exist
        pydb_path = '{}/pydb'.format(self._db_path)

        pydbpath_info = self._storage.get_file_info(pydb_path + '/')

        if not (pydbpath_info.file_exists and pydbpath_info.file_is_folder):
            self._storage.make_dir(pydb_path)
            self._collections = self.protobufs.CollectionsDescriptor()
            self._update_collections()

        # Load database descriptors from disk
        self._collections = self._load_descriptor(
            self.protobufs.CollectionsDescriptor, 'pydb/descriptor.bin')
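
    # Usage sketch (added for illustration, not from the original source;
    # assumes scannerpy is importable and a valid config is available):
    #
    #   from scannerpy import Database
    #   db = Database()                                    # local in-process cluster
    #   db = Database(config_path='/tmp/my_scanner.toml')  # hypothetical path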

    def __del__(self):
        self.stop_cluster()

    def __enter__(self):
        return self

    def __exit__(self, exception_type, exception_val, exception_tb):
        self.stop_cluster()
        del self._db

    def get_build_flags(self):
        """
        Gets the g++ build flags for compiling custom ops.

        For example, to compile a custom kernel:
        \code{.sh}
        export SCANNER_FLAGS=`python -c "import scannerpy as sp; print(sp.Database().get_build_flags())"`
        g++ mykernel.cpp -o mylib.so `echo $SCANNER_FLAGS`
        \endcode

        Returns:
           A flag string.
        """

        include_dirs = self._bindings.get_include().split(";")
        include_dirs.append(self.config.module_dir + "/include")
        include_dirs.append(self.config.module_dir + "/build")
        flags = '{include} -std=c++11 -fPIC -shared -L{libdir} -lscanner {other}'
        return flags.format(include=" ".join(["-I " + d
                                              for d in include_dirs]),
                            libdir='{}/build'.format(self.config.module_dir),
                            other=self._bindings.other_flags())

    def print_build_flags(self):
        sys.stdout.write(self.get_build_flags())

    def summarize(self):
        summary = ''
        db_meta = self._load_db_metadata()
        if len(db_meta.tables) == 0:
            return 'Your database is empty!'

        tables = [
            ('TABLES', [
                ('Name', [t.name for t in db_meta.tables]),
                ('# rows',
                 [str(self.table(t.id).num_rows()) for t in db_meta.tables]),
                ('Columns', [
                    ', '.join(self.table(t.id).column_names())
                    for t in db_meta.tables
                ]),
            ]),
        ]

        if len(self._collections.names) > 0:
            tables.append(('COLLECTIONS',
                           [('Name', self._collections.names),
                            ('# tables', [
                                str(len(self.collection(id).table_names()))
                                for id in self._collections.ids
                            ])]))

        for table_idx, (label, cols) in enumerate(tables):
            if table_idx > 0:
                summary += '\n\n'
            num_cols = len(cols)
            max_col_lens = [
                max(max([len(s) for s in c] or [0]), len(name))
                for name, c in cols
            ]
            table_width = sum(max_col_lens) + 3 * (num_cols - 1)
            label = '** {} **'.format(label)
            summary += ' ' * (table_width / 2 - len(label) / 2) + label + '\n'
            summary += '-' * table_width + '\n'
            col_name_fmt = ' | '.join(['{{:{}}}' for _ in range(num_cols)])
            col_name_fmt = col_name_fmt.format(*max_col_lens)
            summary += col_name_fmt.format(*[s for s, _ in cols]) + '\n'
            summary += '-' * table_width + '\n'
            row_fmt = ' | '.join(['{{:{}}}' for _ in range(num_cols)])
            row_fmt = row_fmt.format(*max_col_lens)
            for i in range(len(cols[0][1])):
                summary += row_fmt.format(*[c[i] for _, c in cols]) + '\n'
        return summary

    def _load_descriptor(self, descriptor, path):
        d = descriptor()
        d.ParseFromString(
            self._storage.read('{}/{}'.format(self._db_path, path)))
        return d

    def _save_descriptor(self, descriptor, path):
        self._storage.write('{}/{}'.format(self._db_path, path),
                            descriptor.SerializeToString())

    def _load_db_metadata(self):
        if self._cached_db_metadata is None:
            desc = self._load_descriptor(self.protobufs.DatabaseDescriptor,
                                         'db_metadata.bin')
            self._cached_db_metadata = desc
        return self._cached_db_metadata

    def _connect_to_master(self):
        channel = grpc.insecure_channel(self._master_address,
                                        options=[('grpc.max_message_length',
                                                  24499183 * 2)])
        self._master = self.protobufs.MasterStub(channel)
        result = False
        try:
            self._master.Ping(self.protobufs.Empty())
            result = True
        except grpc.RpcError as e:
            status = e.code()
            if status == grpc.StatusCode.UNAVAILABLE:
                pass
            elif status == grpc.StatusCode.OK:
                result = True
            else:
                raise ScannerException(
                    'Master ping errored with status: {}'.format(status))
        return result

    def _run_remote_cmd(self, host, cmd):
        host_ip, _, _ = host.partition(':')
        host_ip = unicode(socket.gethostbyname(host_ip), "utf-8")
        if ipaddress.ip_address(host_ip).is_loopback:
            return Popen(cmd, shell=True)
        else:
            cmd = cmd.replace('"', '\\"')
            return Popen("ssh {} \"cd {} && {}\"".format(
                host_ip, os.getcwd(), cmd),
                         shell=True)

    def _start_heartbeat(self):
        # Start up heartbeat to keep master alive
        def heartbeat_task(q, master_address):
            import scanner.metadata_pb2 as metadata_types
            import scanner.engine.rpc_pb2 as rpc_types
            import scanner.types_pb2 as misc_types
            import libscanner as bindings

            channel = grpc.insecure_channel(master_address,
                                            options=[
                                                ('grpc.max_message_length',
                                                 24499183 * 2)
                                            ])
            master = rpc_types.MasterStub(channel)
            while q.empty():
                master.PokeWatchdog(rpc_types.Empty())
                time.sleep(1)

        self._heartbeat_queue = Queue()
        self._heartbeat_process = Process(target=heartbeat_task,
                                          args=(self._heartbeat_queue,
                                                self._master_address))
        self._heartbeat_process.daemon = True
        self._heartbeat_process.start()

    def _stop_heartbeat(self):
        self._heartbeat_queue.put(0)
        self._heartbeat_process.join()

    def start_cluster(self, master, workers):
        """
        Starts a Scanner cluster.

        Args:
            master: ssh-able address of the master node.
            workers: list of ssh-able addresses of the worker nodes.
        """

        if master is None:
            self._master_address = (self.config.master_address + ':' +
                                    self.config.master_port)
        else:
            self._master_address = master
        if workers is None:
            self._worker_addresses = [
                self.config.master_address + ':' + self.config.worker_port
            ]
        else:
            self._worker_addresses = workers

        # Boot up C++ database bindings
        self._db = self._bindings.Database(self.config.storage_config,
                                           self._db_path, self._master_address)

        if self._debug:
            self._master_conn = None
            self._worker_conns = None
            machine_params = self._bindings.default_machine_params()
            res = self._bindings.start_master(self._db,
                                              self.config.master_port).success
            assert res
            res = self._connect_to_master()
            assert res

            self._start_heartbeat()

            for i in range(len(self._worker_addresses)):
                res = self._bindings.start_worker(
                    self._db, machine_params,
                    str(int(self.config.worker_port) + i)).success
                assert res
        else:
            master_port = self._master_address.partition(':')[2]
            pickled_config = pickle.dumps(self.config)
            master_cmd = (
                'python -c ' + '\"from scannerpy import start_master\n' +
                'import pickle\n' +
                'config=pickle.loads(\'\'\'{config:s}\'\'\')\n' +
                'start_master(port=\'{master_port:s}\', block=True, config=config)\"'
            ).format(master_port=master_port, config=pickled_config)
            worker_cmd = (
                'python -c ' + '\"from scannerpy import start_worker\n' +
                'import pickle\n' +
                'config=pickle.loads(\'\'\'{config:s}\'\'\')\n' +
                'start_worker(\'{master:s}\', port=\'{worker_port:s}\', block=True, config=config)\"'
            )
            self._master_conn = self._run_remote_cmd(self._master_address,
                                                     master_cmd)

            # Wait for master to start
            slept_so_far = 0
            sleep_time = 20
            while slept_so_far < sleep_time:
                if self._connect_to_master():
                    break
                time.sleep(0.3)
                slept_so_far += 0.3
            if slept_so_far >= sleep_time:
                self._master_conn.kill()
                self._master_conn = None
                raise ScannerException(
                    'Timed out waiting to connect to master')
            # Start up heartbeat to keep master alive
            self._start_heartbeat()

            # Start workers now that master is ready
            self._worker_conns = [
                self._run_remote_cmd(
                    w,
                    worker_cmd.format(master=self._master_address,
                                      config=pickled_config,
                                      worker_port=w.partition(':')[2]))
                for w in self._worker_addresses
            ]
            slept_so_far = 0
            # Has to be this long for GCS
            sleep_time = 60
            while slept_so_far < sleep_time:
                active_workers = self._master.ActiveWorkers(
                    self.protobufs.Empty())
                if (len(active_workers.workers) > len(self._worker_addresses)):
                    raise ScannerException(
                        ('Master has more workers than requested ' +
                         '({:d} vs {:d})').format(len(active_workers.workers),
                                                  len(self._worker_addresses)))
                if (len(active_workers.workers) == len(
                        self._worker_addresses)):
                    break
                time.sleep(0.3)
                slept_so_far += 0.3
            if slept_so_far >= sleep_time:
                self._master_conn.kill()
                for wc in self._worker_conns:
                    wc.kill()
                self._master_conn = None
                self._worker_conns = None
                raise ScannerException(
                    'Timed out waiting for workers to connect to master')

        # Load stdlib
        stdlib_path = '{}/build/stdlib'.format(self.config.module_dir)
        self.load_op('{}/libstdlib.so'.format(stdlib_path),
                     '{}/stdlib_pb2.py'.format(stdlib_path))
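
    # Usage sketch (hypothetical addresses): start_cluster() is invoked by
    # __init__, so a remote cluster is normally requested through the
    # constructor rather than by calling this method directly:
    #
    #   db = Database(master='master-host:5001',
    #                 workers=['worker-a:5002', 'worker-b:5002'])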

    def stop_cluster(self):
        if self._master:
            # Stop heartbeat
            self._stop_heartbeat()
            try:
                self._try_rpc(
                    lambda: self._master.Shutdown(self.protobufs.Empty()))
            except:
                pass
            self._master = None
        if self._master_conn:
            self._master_conn.kill()
            self._master_conn = None
        if self._worker_conns:
            for wc in self._worker_conns:
                wc.kill()
            self._worker_conns = None

    def _try_rpc(self, fn):
        try:
            result = fn()
        except grpc.RpcError as e:
            raise ScannerException(e)

        if isinstance(result, self.protobufs.Result):
            if not result.success:
                raise ScannerException(result.msg)

        return result

    def load_op(self, so_path, proto_path=None):
        """
        Loads a custom op into the Scanner runtime.

        By convention, if the op requires arguments from Python, it must
        have a protobuf message called <OpName>Args, e.g. BlurArgs or
        HistogramArgs, and the path to that protobuf should be provided.

        Args:
            so_path: Path to the custom op's shared object file.

        Kwargs:
            proto_path: Path to the custom op's arguments protobuf
                        if one exists.
        """
        if proto_path is not None:
            self.protobufs.add_module(proto_path)
        op_path = self.protobufs.OpPath()
        op_path.path = so_path
        self._try_rpc(lambda: self._master.LoadOp(op_path))
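
    # Usage sketch (hypothetical paths; the op and its argument protobuf are
    # assumed to follow the <OpName>Args convention described above):
    #
    #   db.load_op('/path/to/libblur.so', '/path/to/blur_pb2.py')
    #   # The op then becomes available through db.ops, e.g. db.ops.Blur(...).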

    def register_python_op(self, kernel_path):
        kernel_path = os.path.abspath(kernel_path)

        def make_op(*args, **kwargs):
            return self.ops.Python(*args,
                                   py_args=pickle.dumps(kwargs),
                                   kernel_path=kernel_path)

        return make_op
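
    # Usage sketch (hypothetical path): wrap a standalone Python kernel so it
    # can be instantiated like a built-in op; any keyword arguments are
    # pickled into py_args for the kernel.
    #
    #   my_op = db.register_python_op('/path/to/my_kernel.py')
    #   # my_op(...) forwards to db.ops.Python with kernel_path set above.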

    def _update_collections(self):
        self._save_descriptor(self._collections, 'pydb/descriptor.bin')

    def delete_collection(self, collection_name):
        if collection_name not in self._collections.names:
            raise ScannerException(
                'Collection with name {} does not exist'.format(
                    collection_name))

        index = self._collections.names[:].index(collection_name)
        id = self._collections.ids[index]
        del self._collections.names[index]
        del self._collections.ids[index]

        self._storage.delete_file('{}/pydb/collection_{}.bin'.format(
            self._db_path, id))

    def new_collection(self,
                       collection_name,
                       table_names,
                       force=False,
                       job_id=None):
        """
        Creates a new Collection from a list of tables.

        Args:
            collection_name: String name of the collection to create.
            table_names: List of table name strings to put in the collection.

        Kwargs:
            force: TODO(wcrichto)
            job_id: TODO(wcrichto)

        Returns:
            The new Collection object.
        """

        if collection_name in self._collections.names:
            if force:
                self.delete_collection(collection_name)
            else:
                raise ScannerException(
                    'Collection with name {} already exists'.format(
                        collection_name))

        last_id = self._collections.ids[-1] if len(
            self._collections.ids) > 0 else -1
        new_id = last_id + 1
        self._collections.ids.append(new_id)
        self._collections.names.append(collection_name)
        self._update_collections()
        collection = self.protobufs.CollectionDescriptor()
        collection.tables.extend(table_names)
        collection.job_id = -1 if job_id is None else job_id
        self._save_descriptor(collection,
                              'pydb/collection_{}.bin'.format(new_id))

        return self.collection(collection_name)
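
    # Usage sketch (hypothetical names; the tables must already exist in the
    # database):
    #
    #   collection = db.new_collection('my_collection',
    #                                  ['table_a', 'table_b'],
    #                                  force=True)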

    def ingest_videos(self, videos, force=False):
        """
        Creates Tables from a list of videos.

        Args:
            videos: List of (table_name, video_path) tuples to ingest.

        Kwargs:
            force: TODO(wcrichto)

        Returns:
            (list of created Tables, list of (path, reason) failures to ingest)
        """

        if len(videos) == 0:
            raise ScannerException('Must ingest at least one video.')

        [table_names, paths] = zip(*videos)
        for table_name in table_names:
            if self.has_table(table_name):
                if force is True:
                    self._delete_table(table_name)
                else:
                    raise ScannerException(
                        'Attempted to ingest over existing table {}'.format(
                            table_name))
        self._save_descriptor(self._load_db_metadata(), 'db_metadata.bin')
        ingest_params = self.protobufs.IngestParameters()
        ingest_params.table_names.extend(table_names)
        ingest_params.video_paths.extend(paths)
        ingest_result = self._try_rpc(
            lambda: self._master.IngestVideos(ingest_params))
        if not ingest_result.result.success:
            raise ScannerException(ingest_result.result.msg)
        failures = zip(ingest_result.failed_paths,
                       ingest_result.failed_messages)

        self._cached_db_metadata = None
        return ([
            self.table(t) for (t, p) in videos
            if p not in ingest_result.failed_paths
        ], failures)
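
    # Usage sketch (hypothetical paths): ingest two videos as new tables and
    # report any that failed.
    #
    #   tables, failures = db.ingest_videos(
    #       [('video_a', '/path/to/a.mp4'), ('video_b', '/path/to/b.mp4')],
    #       force=True)
    #   for path, reason in failures:
    #       print('Failed to ingest {}: {}'.format(path, reason))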

    def ingest_video_collection(self, collection_name, videos, force=False):
        """
        Creates a Collection from a list of videos.

        Args:
            collection_name: String name of the Collection to create.
            videos: List of video paths.

        Kwargs:
            force: TODO(wcrichto)

        Returns:
            (Collection, list of (path, reason) failures to ingest)
        """
        table_names = [
            '{}:{:03d}'.format(collection_name, i) for i in range(len(videos))
        ]
        tables, failures = self.ingest_videos(zip(table_names, videos), force)
        collection = self.new_collection(collection_name,
                                         [t.name() for t in tables], force)
        return collection, failures
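
    # Usage sketch (hypothetical paths): the videos become tables named
    # 'my_videos:000', 'my_videos:001', ... inside a new collection.
    #
    #   collection, failures = db.ingest_video_collection(
    #       'my_videos', ['/path/to/a.mp4', '/path/to/b.mp4'], force=True)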

    def has_collection(self, name):
        return name in self._collections.names

    def collection(self, name):
        if isinstance(name, basestring):
            index = self._collections.names[:].index(name)
            id = self._collections.ids[index]
        else:
            id = name
        collection = self._load_descriptor(self.protobufs.CollectionDescriptor,
                                           'pydb/collection_{}.bin'.format(id))
        return Collection(self, name, collection)

    def has_table(self, name):
        db_meta = self._load_db_metadata()
        for table in db_meta.tables:
            if table.name == name:
                return True
        return False

    def _delete_table(self, name):
        table = self.table(name)
        db_meta = self._load_db_metadata()
        for i, t in enumerate(db_meta.tables):
            if t.id == table.id():
                del db_meta.tables[i]
                return
        assert False

    def delete_table(self, name):
        self._delete_table(name)
        self._save_descriptor(self._load_db_metadata(), 'db_metadata.bin')

    def new_table(self, name, columns, rows, fn=None, force=False):
        """
        Creates a new table from a list of rows.

        Args:
            name: String name of the table to create
            columns: List of names of table columns
            rows: List of rows with each row a list of elements corresponding
                  to the specified columns. Elements must be strings of
                  serialized representations of the data.

        Kwargs:
            fn: TODO(wcrichto)
            force: TODO(apoms)

        Returns:
            The new table object.
        """

        if self.has_table(name):
            if force:
                self.delete_table(name)
            else:
                raise ScannerException(
                    'Attempted to create table with existing '
                    'name {}'.format(name))
        if fn is not None:
            rows = [fn(row) for row in rows]
        cols = copy.copy(columns)
        cols.insert(0, "index")
        for i, row in enumerate(rows):
            row.insert(0, struct.pack('=Q', i))
        self._bindings.new_table(self._db, name, cols, rows)
        self._cached_db_metadata = None
        return self.table(name)
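
    # Usage sketch (illustrative data): rows are lists of serialized strings,
    # one element per column; an 'index' column is prepended automatically.
    #
    #   t = db.new_table('pairs',
    #                    ['a', 'b'],
    #                    [['1', '2'], ['3', '4']],
    #                    force=True)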

    def table(self, name):
        db_meta = self._load_db_metadata()

        if isinstance(name, basestring):
            table_id = None
            for table in db_meta.tables:
                if table.name == name:
                    table_id = table.id
                    break
            if table_id is None:
                raise ScannerException(
                    'Table with name {} not found'.format(name))
            for table in db_meta.tables:
                if table.name == name and table.id != table_id:
                    raise ScannerException(
                        'Internal error: multiple tables with same name: {}'.
                        format(name))
        elif isinstance(name, int):
            table_id = name
        else:
            raise ScannerException('Invalid table identifier')

        descriptor = self._load_descriptor(
            self.protobufs.TableDescriptor,
            'tables/{}/descriptor.bin'.format(table_id))
        return Table(self, descriptor)

    def profiler(self, job_name):
        db_meta = self._load_db_metadata()
        if isinstance(job_name, basestring):
            job_id = None
            for job in db_meta.jobs:
                if job.name == job_name:
                    job_id = job.id
                    break
            if job_id is None:
                raise ScannerException(
                    'Job name {} does not exist'.format(job_name))
        else:
            job_id = job_name

        return Profiler(self, job_id)

    def _get_op_info(self, op_name):
        op_info_args = self.protobufs.OpInfoArgs()
        op_info_args.op_name = op_name

        op_info = self._try_rpc(lambda: self._master.GetOpInfo(op_info_args))

        if not op_info.result.success:
            raise ScannerException(op_info.result.msg)

        return op_info

    def _check_has_op(self, op_name):
        self._get_op_info(op_name)

    def _get_input_columns(self, op_name):
        return self._get_op_info(op_name).input_columns

    def _get_output_columns(self, op_name):
        return self._get_op_info(op_name).output_columns

    def _toposort(self, job):
        op = job.op(self)
        edges = defaultdict(list)
        in_edges_left = defaultdict(int)
        input_tables = []

        # Coalesce multiple inputs into a single table
        start_node = self.ops.Input([], None, None)
        explored_nodes = set()
        stack = [op]
        to_change = []
        while len(stack) > 0:
            c = stack.pop()
            explored_nodes.add(c)

            for input in c._inputs:
                if input._op._name == "InputTable" and input._op != start_node:
                    if not input._op in input_tables:
                        input_tables.append(input._op)
                    idx = input_tables.index(input._op)
                    to_change.append((input, idx))
                    input._op = start_node

                if input._op not in explored_nodes:
                    stack.append(input._op)

        def input_col_name(col, idx):
            if len(input_tables) > 1 and idx != 0:
                return '{}{:d}'.format(col, idx)
            else:
                return col

        for (input, idx) in to_change:
            input._col = input_col_name(input._col, idx)

        new_start_node_inputs = []
        for i, t in enumerate(input_tables):
            for c in t._inputs:
                col = Column(c._table, c._descriptor, c._video_descriptor)
                col._name = input_col_name(c._descriptor.name, i)
                new_start_node_inputs.append(col)
        start_node._inputs = new_start_node_inputs

        # Perform DFS on modified graph
        explored_nodes = set([start_node])
        stack = [op]
        while len(stack) > 0:
            c = stack.pop()
            explored_nodes.add(c)

            if c._name == "InputTable": continue

            for input in c._inputs:
                edges[input._op].append(c)
                in_edges_left[c] += 1

                if input._op not in explored_nodes:
                    stack.append(input._op)

        # Compute sorted list
        eval_sorted = []
        eval_index = {}
        stack = [start_node]
        while len(stack) > 0:
            c = stack.pop()
            eval_sorted.append(c)
            eval_index[c] = len(eval_sorted) - 1
            for child in edges[c]:
                in_edges_left[child] -= 1
                if in_edges_left[child] == 0:
                    stack.append(child)

        for c in eval_sorted[1:]:
            for i in c._inputs:
                if i._op in input_tables:
                    idx = input_tables.index(i._op)
                    i._col = input_col_name(i._col, idx)

        eval_sorted[-1]._inputs.insert(
            0, OpColumn(self, eval_sorted[0], "index", self.protobufs.Other))

        task = input_tables[0]._generator()
        if job.name() is not None:
            task.output_table_name = job.name()

        for t in input_tables[1:]:
            task.samples.extend(t._generator().samples)

        return [e.to_proto(eval_index) for e in eval_sorted], \
          task, input_tables[0]

    def _parse_size_string(self, s):
        (prefix, suffix) = (s[:-1], s[-1])
        mults = {'G': 1024**3, 'M': 1024**2, 'K': 1024**1}
        suffix = suffix.upper()
        if suffix not in mults:
            raise ScannerException('Invalid size suffix in "{}"'.format(s))
        return int(prefix) * mults[suffix]
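
    # Example (illustrative): '256M' -> 256 * 1024**2 and '4G' -> 4 * 1024**3,
    # so run(..., cpu_pool='4G') requests a 4 GiB CPU memory pool and
    # cpu_pool='p4G' additionally pins it (see run() below).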

    def run(self,
            jobs,
            force=False,
            work_item_size=250,
            cpu_pool=None,
            gpu_pool=None,
            pipeline_instances_per_node=None,
            show_progress=True,
            profiling=False,
            load_sparsity_threshold=8,
            tasks_in_queue_per_pu=4):
        """
        Runs a computation over a set of inputs.

        Args:
            jobs: A Job, or list of Jobs, describing the computation to run.
                  If a single Job is given and its input is a Collection, the
                  computation is run on every table in that collection and the
                  output tables are gathered into a new Collection named after
                  the job.

        Kwargs:
            force: TODO(wcrichto)
            work_item_size: TODO(wcrichto)
            cpu_pool: TODO(wcrichto)
            gpu_pool: TODO(wcrichto)
            pipeline_instances_per_node: TODO(wcrichto)
            show_progress: TODO(wcrichto)

        Returns:
            The output Collection if the input was a Collection, otherwise the
            output Table (or a list of Tables when a list of Jobs was given).
        """

        # Get compression annotations

        compression_options = []
        # For index column
        opts = self.protobufs.OutputColumnCompression()
        opts.codec = 'default'
        compression_options.append(opts)
        output_op = jobs[0].op(self) if isinstance(jobs,
                                                   list) else jobs.op(self)
        for out_col in output_op.inputs():
            opts = self.protobufs.OutputColumnCompression()
            opts.codec = 'default'
            if out_col._type == self.protobufs.Video:
                for k, v in out_col._encode_options.iteritems():
                    if k == 'codec':
                        opts.codec = v
                    else:
                        opts.options[k] = str(v)
            compression_options.append(opts)

        output_collection = None
        if isinstance(jobs, list):
            ops, task, _ = self._toposort(jobs[0])
            tasks = [task] + [self._toposort(job)[1] for job in jobs[1:]]
        else:
            job = jobs
            ops, task, input_op = self._toposort(job)
            tasks = [task]
            collection = input_op._collection
            if collection is not None:
                output_collection = job.name()
                if self.has_collection(output_collection) and not force:
                    raise ScannerException(
                        'Collection with name {} already exists'.format(
                            output_collection))
                for t in collection.tables()[1:]:
                    t_task = input_op._generator(t)
                    t_task.output_table_name = '{}:{}'.format(
                        output_collection,
                        t.name().split(':')[-1])
                    tasks.append(t_task)

        for task in tasks:
            if self.has_table(task.output_table_name):
                if force:
                    self._delete_table(task.output_table_name)
                else:
                    raise ScannerException(
                        'Job would overwrite existing table {}'.format(
                            task.output_table_name))
        self._save_descriptor(self._load_db_metadata(), 'db_metadata.bin')

        job_params = self.protobufs.JobParameters()
        job_name = ''.join(choice(ascii_uppercase) for _ in range(12))
        job_params.job_name = job_name
        job_params.task_set.tasks.extend(tasks)
        job_params.task_set.ops.extend(ops)
        job_params.task_set.compression.extend(compression_options)
        job_params.pipeline_instances_per_node = pipeline_instances_per_node or -1
        job_params.work_item_size = work_item_size
        job_params.show_progress = show_progress
        job_params.profiling = profiling
        job_params.tasks_in_queue_per_pu = tasks_in_queue_per_pu
        job_params.load_sparsity_threshold = load_sparsity_threshold

        job_params.memory_pool_config.pinned_cpu = False
        if cpu_pool is not None:
            job_params.memory_pool_config.cpu.use_pool = True
            if cpu_pool[0] == 'p':
                job_params.memory_pool_config.pinned_cpu = True
                cpu_pool = cpu_pool[1:]
            size = self._parse_size_string(cpu_pool)
            job_params.memory_pool_config.cpu.free_space = size

        if gpu_pool is not None:
            job_params.memory_pool_config.gpu.use_pool = True
            size = self._parse_size_string(gpu_pool)
            job_params.memory_pool_config.gpu.free_space = size

        # Run the job
        self._try_rpc(lambda: self._master.NewJob(job_params))

        # Invalidate db metadata because of job run
        self._cached_db_metadata = None

        db_meta = self._load_db_metadata()
        job_id = None
        for job in db_meta.jobs:
            if job.name == job_name:
                job_id = job.id
        if job_id is None:
            raise ScannerException(
                'Internal error: job id not found after run')

        # Return a new collection if the input was a collection, otherwise
        # return a table list
        table_names = [task.output_table_name for task in tasks]
        if output_collection is not None:
            return self.new_collection(output_collection, table_names, force,
                                       job_id)
        else:
            if isinstance(jobs, list):
                return [self.table(t) for t in table_names]
            else:
                return self.table(table_names[0])
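
    # Usage sketch (the Job construction details are an assumption; run()
    # accepts either a single Job or a list of Jobs):
    #
    #   job = Job(...)                    # a job whose op graph ends in an output op
    #   table = db.run(job, force=True, pipeline_instances_per_node=2)
    #   tables = db.run([job_a, job_b], force=True)   # list in, list of Tables out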