Example 1
    def run(self,
            data,
            func,
            split_count=None,
            timeout=None,
            push_type=PushTypes.Async):
        """Use a asyncio to apply func to data

        Parameters
        ----------
        data
            An iterable to process
        func : callable
            An async callable that will be passed data to operate on using asyncio.
        split_count : int, optional
            How many slices to split the data into for concurrent processing. Default
            is to set split_count = len(data).
        timeout : int or float, optional
            Time to wait for jobs to complete before raising an error. Ignored
            unless using a push_type that waits for results.
        push_type : str, optional
            If "async", push the Futures immediately.
            If "input", push the input data immediately after task submission.
            If "result", collect the result synchronously and push it.

        """
        split_count = split_count or len(data)
        splits = divide_data(data, split_count)
        info("%s: data len: %s, splits: %s" %
             (self.__class__.__name__, size(data, "n/a"), split_count))

        loop, close = get_or_create_event_loop()

        try:
            futures = [loop.create_task(func(split)) for split in splits]

            if push_type == PushTypes.Async:
                for future in futures:
                    self.push(future)
            elif push_type == PushTypes.Input:
                self.push(data)
            elif push_type == PushTypes.Result:
                self.push(self.get_results(futures, timeout=timeout))
            else:
                raise AssertionError("Invalid push_type: %s" % push_type)
        finally:
            if close and push_type == PushTypes.Result:
                # We can only be sure it's safe to close the event loop if it
                # was created and all processing took place in here.
                loop.close()
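
A self-contained sketch of the same fan-out pattern using only the standard library; split_evenly stands in for divide_data and the timeout handling is an assumption, not the library's get_results helper:

import asyncio

def split_evenly(data, split_count):
    # Assumed stand-in for divide_data: yield split_count roughly equal slices.
    data = list(data)
    chunk, rem = divmod(len(data), split_count)
    start = 0
    for i in range(split_count):
        end = start + chunk + (1 if i < rem else 0)
        yield data[start:end]
        start = end

async def apply_concurrently(data, func, split_count=None, timeout=None):
    # One task per slice, gathered together, optionally bounded by a timeout.
    data = list(data)
    split_count = split_count or len(data)
    tasks = [asyncio.create_task(func(split))
             for split in split_evenly(data, split_count)]
    return await asyncio.wait_for(asyncio.gather(*tasks), timeout=timeout)

async def double_all(chunk):
    return [x * 2 for x in chunk]

# asyncio.run(apply_concurrently(range(10), double_all, split_count=3))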
Example 2
def consume(pipeline, data, cleanup=None, **node_contexts):
    """Handles node contexts before/after calling pipeline.consume()

    Note
    ----
    It would have been better to subclass Pipeline and implement this logic
    right before/after the core consume() call, but there is a bug in pickle
    that prevents that from working with multiprocessing.

    """
    update_node_contexts(pipeline, node_contexts)
    try:
        contexts = get_node_contexts(pipeline)
        dbg("size=%s\n%s" % (size(data, "n/a"), pf(contexts)), indent="label")
        try:
            if data is None:
                return consume_none(pipeline)
            else:
                return pipeline.consume(iterize(data))
        finally:
            if cleanup:
                clean_up_nodes(cleanup, contexts)
    finally:
        reset_node_contexts(pipeline, node_contexts)
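
The context handling above temporarily applies per-node overrides and always restores them, even on error. A minimal stand-alone sketch of that pattern, assuming nodes can be looked up by name on the pipeline and carry a context dict (both assumptions, not the library's actual interface):

from contextlib import contextmanager

@contextmanager
def temporary_node_contexts(pipeline, node_contexts):
    # Save each node's current context, apply the overrides, and restore the
    # originals in a finally block, mirroring the try/finally in consume().
    saved = {}
    for name, ctx in node_contexts.items():
        node = pipeline[name]          # assumed lookup by node name
        saved[name] = dict(node.context)
        node.context.update(ctx)
    try:
        yield pipeline
    finally:
        for name, old_ctx in saved.items():
            pipeline[name].context.clear()
            pipeline[name].context.update(old_ctx)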
Example 3
    def print(self, data):
        print("data length: %s" % size(data, "n/a"))
Example 4
    def run(self,
            data,
            func,
            executor=None,
            executor_kwargs=None,
            split_count=None,
            timeout=None,
            push_type=PushTypes.Async,
            **kwargs):
        """Use a parallel executor to apply func to data

        Parameters
        ----------
        data
            An iterable to process
        func : callable
            A callable that will be passed data to operate on in parallel
        executor : Executor, optional
            If passed use this executor instead of creating one.
        executor_kwargs : dict, optional
            Keyword arguments to pass when initializing an executor.
        split_count : int, optional
            How many slices to split the data into for parallel processing. Default
            is to set split_count = number of workers.
        timeout : int or float, optional
            Time to wait for jobs to complete before raising an error. Ignored
            unless using a push_type that waits for results.
        push_type : str, optional
            If "async", push the Futures immediately.
            If "input", push the input data immediately after task submission.
            If "result", collect the result synchronously and push it.
        **kwargs
            Keyword arguments passed to the executor when submitting work

        """
        self.check_data(data)

        shutdown = True
        if executor:
            shutdown = False
        else:
            executor_kwargs = executor_kwargs or {}
            executor = self.get_executor(**executor_kwargs)

        try:
            worker_count = self.get_worker_count(executor)
            split_count = split_count or worker_count
            splits = divide_data(data, split_count)
            info("%s: data len: %s, splits: %s, workers: %d" % (
                self.__class__.__name__,
                size(data, "n/a"),
                split_count,
                worker_count,
            ))
            futures = self.submit(executor, func, splits, **kwargs)

            if push_type == PushTypes.Async:
                for future in futures:
                    self.push(future)
            elif push_type == PushTypes.Input:
                self.push(data)
            elif push_type == PushTypes.Result:
                self.push(self.get_results(futures, timeout=timeout))
            else:
                raise AssertionError("Invalid push_type: %s" % push_type)

        finally:
            if shutdown:
                self.shutdown_executor(executor)
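
For comparison, a self-contained sketch of the same fan-out using concurrent.futures directly; the round-robin split and unordered result collection are simplifications, not the library's divide_data or get_results behavior:

from concurrent.futures import ThreadPoolExecutor, as_completed

def run_in_parallel(data, func, split_count=None, timeout=None, max_workers=4):
    # Submit one future per slice and collect results as they complete;
    # as_completed raises concurrent.futures.TimeoutError if the timeout expires.
    data = list(data)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        split_count = split_count or max_workers
        # Round-robin split; a contiguous split would also work here.
        splits = [data[i::split_count] for i in range(split_count)]
        futures = [executor.submit(func, split) for split in splits]
        return [f.result() for f in as_completed(futures, timeout=timeout)]

# run_in_parallel(range(100), sum, split_count=4)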
Example 5
    def run(
        self,
        rows,
        conn,
        table,
        cursor=None,
        commit=True,
        rollback=False,
        stmt_type="REPLACE",
        odku=False,
        swap=False,
        keep_old=False,
        push_data=False,
        dry_run=False,
    ):
        """Form SQL statement and use bulk execute to write rows to table

        Parameters
        ----------
        rows
            Iterable of rows to load to the table
        conn
            Database connection
        table : str
            Name of a table to write the data to
        cursor : optional
            Database connection cursor
        commit : bool, optional
            If true, try to commit the transaction. If your connection
            autocommits, this will have no effect. If this is a SQLAlchemy
            connection and you are in a transaction, it will try to get a
            reference to the current transaction and call commit on that.
        rollback : bool, optional
            If true, try to roll back the transaction on exceptions. Behavior
            may vary by backend DB library if you are not currently in a
            transaction.
        stmt_type : str, optional
            Type of SQL statement to use (REPLACE, INSERT, etc.). **Note:** Backend
            support for this varies.
        odku : bool or list, optional
            If true, add ON DUPLICATE KEY UPDATE clause for all columns. If a
            list then only add it for the specified columns. **Note:** Backend
            support for this varies.
        swap : bool, optional
            If true, load a table and then swap it into the target table via rename.
            Not supported with all database back ends.
        keep_old : bool, optional
            If true and swapping tables, keep the original table with a __old
            suffix added to the name
        push_data : bool, optional
            If true, push the data forward instead of the table name
        dry_run : bool, optional
            If true, skip actually loading the data

        """
        load_table = table
        if swap:
            load_table = add_table_suffix(table, "__swap")

        sql = self.get_bulk_statement(conn, stmt_type, load_table, rows, odku=odku)
        dbg("Loading %d rows\n%s" % (size(rows, "n/a"), sqlformat(sql)), indent="label")

        if dry_run:
            warn("dry_run=True, skipping load in %s.run" % self.__class__.__name__)
        else:
            if not cursor:
                cursor = self.get_sql_executor(conn)

            try:
                if swap:
                    self.create_like(conn, cursor, load_table, table, drop=True)

                self.executemany(conn, cursor, sql, rows)

                if swap:
                    old_table = add_table_suffix(table, "__old")
                    self.rename_tables(
                        conn, cursor, [(table, old_table), (load_table, table)]
                    )
                    if not keep_old:
                        self.drop_table(conn, cursor, old_table)

                if commit:
                    self.commit(conn)
            except:
                if rollback:
                    self.rollback(conn)
                raise

        if push_data:
            self.push(rows)
        else:
            self.push(table)
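
A minimal sketch of the bulk-execute step only, using the standard-library sqlite3 module rather than the node's get_bulk_statement/executemany helpers; the column handling and statement building here are simplified assumptions:

import sqlite3

def bulk_load(conn, table, rows, stmt_type="REPLACE"):
    # Build a parameterized bulk statement from the first row's keys and
    # executemany the rows; commit on success, roll back and re-raise on error.
    # Assumes rows are dicts with identical keys and that table/column names
    # come from trusted code, not user input.
    rows = list(rows)
    columns = list(rows[0].keys())
    placeholders = ", ".join(["?"] * len(columns))
    sql = "%s INTO %s (%s) VALUES (%s)" % (
        stmt_type, table, ", ".join(columns), placeholders
    )
    cursor = conn.cursor()
    try:
        cursor.executemany(sql, [tuple(r[c] for c in columns) for r in rows])
        conn.commit()
    except Exception:
        conn.rollback()
        raise

# Example with an in-memory SQLite table:
# conn = sqlite3.connect(":memory:")
# conn.execute("CREATE TABLE users (id INTEGER PRIMARY KEY, name TEXT)")
# bulk_load(conn, "users", [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}])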