def run(self, data, func, split_count=None, timeout=None, push_type=PushTypes.Async):
    """Use asyncio to apply func to data

    Parameters
    ----------
    data
        An iterable to process
    func : callable
        An async callable that will be passed data to operate on using asyncio.
    split_count : int, optional
        How many slices to split the data into for concurrent processing.
        Default is to set split_count = len(data).
    timeout : int or float, optional
        Time to wait for jobs to complete before raising an error. Ignored
        unless using a push_type that waits for results.
    push_type : str, optional
        If "async", push the Futures immediately.
        If "input", push the input data immediately after task submission.
        If "result", collect the result synchronously and push it.

    """
    split_count = split_count or len(data)
    splits = divide_data(data, split_count)
    info(
        "%s: data len: %s, splits: %s"
        % (self.__class__.__name__, size(data, "n/a"), split_count)
    )

    loop, close = get_or_create_event_loop()
    try:
        futures = [loop.create_task(func(split)) for split in splits]

        if push_type == PushTypes.Async:
            for future in futures:
                self.push(future)
        elif push_type == PushTypes.Input:
            self.push(data)
        elif push_type == PushTypes.Result:
            self.push(self.get_results(futures, timeout=timeout))
        else:
            raise AssertionError("Invalid push_type: %s" % push_type)
    finally:
        if close and push_type == PushTypes.Result:
            # We can only be sure it's safe to close the event loop if it
            # was created and all processing took place in here.
            loop.close()
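
# Hypothetical usage sketch (not part of this module): `node` stands for an
# instance of the class defining the run() above, and `double` is an assumed
# example coroutine. With push_type=PushTypes.Result the node waits on the
# created tasks and pushes the gathered results downstream; with the default
# PushTypes.Async it pushes the asyncio Tasks themselves for a later node to
# await.
#
#   async def double(batch):
#       return [x * 2 for x in batch]
#
#   node.run(list(range(10)), double, split_count=2, push_type=PushTypes.Result)
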
def consume(pipeline, data, cleanup=None, **node_contexts):
    """Handles node contexts before/after calling pipeline.consume()

    Note
    ----
    It would have been better to subclass Pipeline and implement this logic
    right before/after the core consume() call, but there is a bug in pickle
    that prevents that from working with multiprocessing.

    """
    update_node_contexts(pipeline, node_contexts)
    try:
        contexts = get_node_contexts(pipeline)
        dbg("size=%s\n%s" % (size(data, "n/a"), pf(contexts)), indent="label")
        try:
            if data is None:
                return consume_none(pipeline)
            else:
                return pipeline.consume(iterize(data))
        finally:
            if cleanup:
                clean_up_nodes(cleanup, contexts)
    finally:
        reset_node_contexts(pipeline, node_contexts)
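
# Hypothetical usage sketch (the pipeline, node names, and cleanup callable
# are assumptions, not part of this module): node contexts are applied before
# consume() and reset afterwards in the outer finally, so per-call keyword
# arguments do not leak into later runs, and cleanup functions run even if
# consume() raises.
#
#   consume(
#       pipeline,
#       rows,
#       cleanup=dict(load_conn=lambda conn: conn.close()),
#       load=dict(conn=my_conn, table="mytable"),
#   )
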
def print(self, data):
    print("data length: %s" % size(data, "n/a"))
def run(
    self,
    data,
    func,
    executor=None,
    executor_kwargs=None,
    split_count=None,
    timeout=None,
    push_type=PushTypes.Async,
    **kwargs
):
    """Use a parallel executor to apply func to data

    Parameters
    ----------
    data
        An iterable to process
    func : callable
        A callable that will be passed data to operate on in parallel
    executor : Executor, optional
        If passed, use this executor instead of creating one.
    executor_kwargs : dict, optional
        Keyword arguments to pass when initializing an executor.
    split_count : int, optional
        How many slices to split the data into for parallel processing.
        Default is to set split_count = number of workers.
    timeout : int or float, optional
        Time to wait for jobs to complete before raising an error. Ignored
        unless using a push_type that waits for results.
    push_type : str, optional
        If "async", push the Futures immediately.
        If "input", push the input data immediately after task submission.
        If "result", collect the result synchronously and push it.
    **kwargs
        Keyword arguments passed to the executor when submitting work

    """
    self.check_data(data)

    shutdown = True
    if executor:
        shutdown = False
    else:
        executor_kwargs = executor_kwargs or {}
        executor = self.get_executor(**executor_kwargs)

    try:
        worker_count = self.get_worker_count(executor)
        split_count = split_count or worker_count
        splits = divide_data(data, split_count)
        info(
            "%s: data len: %s, splits: %s, workers: %d"
            % (
                self.__class__.__name__,
                size(data, "n/a"),
                split_count,
                worker_count,
            )
        )

        futures = self.submit(executor, func, splits, **kwargs)

        if push_type == PushTypes.Async:
            for future in futures:
                self.push(future)
        elif push_type == PushTypes.Input:
            self.push(data)
        elif push_type == PushTypes.Result:
            self.push(self.get_results(futures, timeout=timeout))
        else:
            raise AssertionError("Invalid push_type: %s" % push_type)
    finally:
        if shutdown:
            self.shutdown_executor(executor)
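
# Hypothetical usage sketch (`node` and `heavy_func` are assumptions, not
# part of this module): passing an existing executor leaves ownership with
# the caller and skips the internal shutdown; otherwise one is created from
# executor_kwargs and shut down in the finally block.
#
#   from concurrent.futures import ProcessPoolExecutor
#
#   with ProcessPoolExecutor(max_workers=4) as executor:
#       node.run(data, heavy_func, executor=executor, push_type=PushTypes.Result)
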
def run(
    self,
    rows,
    conn,
    table,
    cursor=None,
    commit=True,
    rollback=False,
    stmt_type="REPLACE",
    odku=False,
    swap=False,
    keep_old=False,
    push_data=False,
    dry_run=False,
):
    """Form SQL statement and use bulk execute to write rows to table

    Parameters
    ----------
    rows
        Iterable of rows to load to the table
    conn
        Database connection
    table : str
        Name of a table to write the data to
    cursor : optional
        Database connection cursor
    commit : bool, optional
        If true, try to commit the transaction. If your connection
        autocommits, this will have no effect. If this is a SQLAlchemy
        connection and you are in a transaction, it will try to get a
        reference to the current transaction and call commit on that.
    rollback : bool, optional
        If true, try to roll back the transaction on exceptions. Behavior
        may vary by backend DB library if you are not currently in a
        transaction.
    stmt_type : str, optional
        Type of SQL statement to use (REPLACE, INSERT, etc.). **Note:**
        Backend support for this varies.
    odku : bool or list, optional
        If true, add an ON DUPLICATE KEY UPDATE clause for all columns. If a
        list, only add it for the specified columns. **Note:** Backend
        support for this varies.
    swap : bool, optional
        If true, load a table and then swap it into the target table via
        rename. Not supported with all database backends.
    keep_old : bool, optional
        If true and swapping tables, keep the original table with a __old
        suffix added to the name
    push_data : bool, optional
        If true, push the data forward instead of the table name
    dry_run : bool, optional
        If true, skip actually loading the data

    """
    load_table = table
    if swap:
        load_table = add_table_suffix(table, "__swap")

    sql = self.get_bulk_statement(conn, stmt_type, load_table, rows, odku=odku)
    # Use %s instead of %d since size() may return "n/a" for unsized iterables
    dbg("Loading %s rows\n%s" % (size(rows, "n/a"), sqlformat(sql)), indent="label")

    if dry_run:
        warn("dry_run=True, skipping load in %s.run" % self.__class__.__name__)
    else:
        if not cursor:
            cursor = self.get_sql_executor(conn)

        try:
            if swap:
                self.create_like(conn, cursor, load_table, table, drop=True)

            self.executemany(conn, cursor, sql, rows)

            if swap:
                old_table = add_table_suffix(table, "__old")
                self.rename_tables(
                    conn, cursor, [(table, old_table), (load_table, table)]
                )
                if not keep_old:
                    self.drop_table(conn, cursor, old_table)

            if commit:
                self.commit(conn)
        except:
            if rollback:
                self.rollback(conn)
            raise

    if push_data:
        self.push(rows)
    else:
        self.push(table)
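
# Hypothetical usage sketch (`node`, `conn`, and the table name are
# assumptions, not part of this module): with swap=True the rows are bulk
# loaded into "mytable__swap", which is then renamed over "mytable"; the
# displaced original becomes "mytable__old" and is dropped unless
# keep_old=True.
#
#   node.run(
#       rows,
#       conn,
#       "mytable",
#       stmt_type="INSERT",
#       swap=True,
#       keep_old=True,
#   )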