def check_conn(self, conn): """Check the database connection""" raiseifnot( self._is_allowed_conn(conn), "Connection type %s is not in allowed types: %s" % (type(conn), self.allowed_conn_types), )
def commit(self, obj): """Commit any currently active transactions""" if hasattr(obj, "commit"): obj.commit() elif is_sqlalchemy_conn(obj): # Hack: I don't want to have to pass around the transaction # between nodes since that requirement is really specific to # SQLAlchemy, and SQLAlchemy doesn't seem to provide a standard # way of obtaining the current transaction, so this approach is # lifted from the SQLAlchemy internals. raiseifnot( hasattr(obj, "_Connection__transaction"), "Could not find transaction attribute on SQLAlchemy object: %s" % obj, ) if getattr(obj, "_Connection__transaction", None): obj._Connection__transaction.commit() else: # SQLAlchemy connections autocommit by default, so we assume # that happened. pass else: raise AssertionError( "Could not determine how to commit with object: %s" % obj)
def create_like(self, conn, cursor, table, like_table, drop=False): """Create a table like another table, optionally trying to drop `table` first""" table = escape_string(str(table).strip("`")) like_table = escape_string(str(like_table).strip("`")) if drop: drop_sql = "drop table if exists %s" % table dbg(drop_sql) self.execute(conn, cursor, drop_sql) if isinstance(conn, sqlite3.Connection): get_create_sql = ( "SELECT sql FROM sqlite_master WHERE type='table' AND name=?") qr = self.execute(conn, cursor, get_create_sql, params=(like_table, )) row = qr.fetchone() raiseifnot(isinstance(row, sqlite3.Row), "Only sqlite3.Row rows are supported") create_sql = row["sql"].replace(like_table, table) else: # Assume this syntax works with most other SQL databases create_sql = "create table %s like %s" % (table, like_table) dbg(create_sql) self.execute(conn, cursor, create_sql)
def run(self, df, func, from_pandas_kwargs=None, **kwargs): """Convert to dask dataframe and use apply() NOTE: it may be more efficient to not convert to/from Dask Dataframe in this manner depending on the pipeline Parameters ---------- df : pandas.DataFrame The pandas DataFrame to apply func to func : callable A callable that will be passed to Dask DataFrame.apply from_pandas_kwargs : optional Keyword arguments to pass to dask.dataframe.from_pandas **kwargs Keyword arguments passed to Dask DataFrame.apply """ raiseifnot(from_pandas, "The dask (dataframe) package is not installed") from_pandas_kwargs = from_pandas_kwargs or {} set_missing_key(from_pandas_kwargs, "chunksize", 500) set_missing_key(from_pandas_kwargs, "sort", False) ddf = from_pandas(df, **from_pandas_kwargs) for column in ddf.columns: ddf[column] = ddf[column].apply(func, meta=(column, ddf[column].dtype), **kwargs) df = ddf.compute() self.push(df)
def run(self, df, func, threads=False, **kwargs): """Use Swifter apply() on a DataFrame Parameters ---------- df : pandas.DataFrame The pandas DataFrame to apply func to func : callable A callable that will be passed to df.swifter.apply threads : bool If true use the "threads" scheduler, else "processes" **kwargs Keyword arguments passed to Dask df.swifter.apply """ raiseifnot(swifter, "The swifter package is not installed") if threads: df.swifter.set_dask_scheduler(scheduler="threads") else: df.swifter.set_dask_scheduler(scheduler="processes") for column in df.columns: df[column] = df[column].swifter.apply(func, **kwargs) self.push(df)
def update_context(self, context): """Update the context dict for this Node""" raiseifnot( isinstance(context, dict), "Context must be dict-like, got %s" % type(context), ) self.context.update(context)
def run(self, df_or_dict, f, push_file=False, dry_run=False, **kwargs): """Use Pandas to_excel to output a DataFrame Parameters ---------- df_or_dict DataFrame or dict of DataFrames to load to an Excel file. In the case of a dict the keys will be the sheet names. f : file or buffer File to write the DataFrame to push_file : bool, optional If true, push the file forward instead of the data dry_run : bool, optional If true, skip actually loading the data **kwargs Keyword arguments passed to DataFrame.to_excel """ raiseifnot(pd, "Please install Pandas to use this class") if dry_run: warn("dry_run=True, skipping load in %s.run" % self.__class__.__name__) else: if isinstance(df_or_dict, dict): with pd.ExcelWriter(f) as writer: for sheet_name, df in df_or_dict.items(): df.to_excel(writer, sheet_name=sheet_name) else: df_or_dict.to_excel(f, **kwargs) if push_file: self.push(f) else: self.push(df_or_dict)
def run(self, df, conn, table, push_table=False, dry_run=False, **kwargs): """Use Pandas to_sql to output a DataFrame Parameters ---------- df : pandas.DataFrame DataFrame to load to a SQL table conn Database connection table : str Name of a table to write the data to push_table : bool, optional If true, push the table forward instead of the data dry_run : bool, optional If true, skip actually loading the data **kwargs Keyword arguments passed to DataFrame.to_sql """ raiseifnot(pd, "Please install Pandas to use this class") if dry_run: warn("dry_run=True, skipping load in %s.run" % self.__class__.__name__) else: df.to_sql(table, conn, **kwargs) if push_table: self.push(table) else: self.push(df)
def run(self, df, conn, schema=None, dry_run=False, **kwargs): """Use Pandas to_sql to output a DataFrame to a temporary table. Push a reference to the temp table forward. Parameters ---------- df : pandas.DataFrame DataFrame to load to a SQL table conn Database connection schema : str, optional schema to create the temp table in dry_run : bool, optional If true, skip actually loading the data **kwargs Keyword arguments passed to DataFrame.to_sql """ raiseifnot(pd, "Please install Pandas to use this class") raiseif( isinstance(conn, sqlite3.Connection), "sqlite3 connections not supported due to bug in Pandas' has_table()", ) table = get_temp_table(conn, df, schema=schema, create=True) if dry_run: warn("dry_run=True, skipping load in %s.run" % self.__class__.__name__) else: df.to_sql(table.name, conn, if_exists="append", **kwargs) self.push(table.name)
def run(self, df, f, push_file=False, dry_run=False, **kwargs): """Use Pandas to_html to output a DataFrame Parameters ---------- df : pandas.DataFrame DataFrame to load to an HTML file f : file or buffer File to write the DataFrame to push_file : bool, optional If true, push the file forward instead of the data dry_run : bool, optional If true, skip actually loading the data **kwargs Keyword arguments passed to DataFrame.to_html """ raiseifnot(pd, "Please install Pandas to use this class") if dry_run: warn("dry_run=True, skipping load in %s.run" % self.__class__.__name__) else: df.to_html(f, **kwargs) if push_file: self.push(f) else: self.push(df)
def check_conn(self, conn): """Make sure the object is a valid SQL connection""" raiseifnot( hasattr(conn, "cursor") or is_sqlalchemy_conn(conn), "Connection must have a cursor() method or be a SQLAlchemy connection: %s" % conn, )
def run(self, data, drop=None, **transforms): """Rename/replace keys in an iterable of dicts Parameters ---------- data Data to process. Expected to be a list/iterable of dict rows. drop : list, optional A list of keys to drop after transformations are complete. **transforms key->value pairs used to populate columns of each dict row. If the value is a callable it is expected to take the row as input and return the value to fill in for the key. """ drop = drop or [] raiseifnot( isinstance(drop, (list, tuple)), "drop argument must be a list/tuple of keys to drop", ) for row in data: raiseifnot(isinstance(row, dict), "Dict rows expected, got %s" % type(row)) for key, value in transforms.items(): if callable(value): row[key] = value(row) else: row[key] = value for key in drop: del row[key] self.push(data)
def assert_arg_present(custom_arg, arg_name): raiseifnot( arg_name in script_args, ("Custom arg %s with dest=%s maps to node arg=%s " "which is not in the script arg list. Check for " "conflicting args that cover the same node arg." % (custom_arg.name, custom_arg.dest, arg_name)), )
def __init__(self, queue, *args, **kwargs): raiseifnot(Queue, "Please install 'rq' to use RQParaGlider") raiseifnot( isinstance(queue, Queue), "The first argument to RQParaGlider must be a Queue", ) self.queue = queue super().__init__(*args, **kwargs)
def __init__(self, *args, **kwargs): raiseifnot( self.allowed_conn_types and isinstance(self.allowed_conn_types, (list, tuple)), ("%s.allowed_conn_types must be a list or tuple of connection types" % self.__class__.__name__), ) super().__init__(*args, **kwargs)
def __init__(self, consume_task, *args, **kwargs): raiseifnot(Celery, "Please install Celery to use CeleryParaGlider") raiseifnot( isinstance(consume_task, Task), ("The first argument to CeleryParaGlider must be a registered " "celery task that mirrors consume()"), ) self.consume_task = consume_task super().__init__(*args, **kwargs)
def begin(self): conn = self.context.get("conn", None) or self.global_state.get( "conn", None) raiseifnot( conn, "%s requires a conn argument in context or global state" % self.__class__.__name__, ) self.check_conn(conn)
def run(self, data, node_init, glider_kwargs=None, consume_kwargs=None): """Build a Glider from the input parameters and call consume Note ---- Currently node_init is limited in the types of DAGs it can represent. Grouping nodes as sublists to form a layer is supported. Parameters ---------- data data to process with the built glider node_init An iterable of nodes, where nodes are represented as JSON-serializable dicts and optionally grouped as layers with sublists. glider_kwargs : dict, optional Keyword arguments to pass to the constructed Glider. consume_kwargs : type, optional Keyword arguments to pass to consume() """ nodes = None for node_init_layer in node_init: if not isinstance(node_init_layer, (list, tuple)): node_init_layer = [node_init_layer] node_layer = [] for node_info in node_init_layer: # TODO: this only supports a single level of node tree evaluation raiseifnot(isinstance(node_info, dict), "Node info must be in dict format") node_args = node_info["args"] node_kwargs = node_info.get("kwargs", {}) cls = import_object(node_info["class_name"]) node = cls(*node_args, **node_kwargs) node_layer.append(node) if len(node_layer) == 1: node_layer = node_layer[0] if nodes is None: nodes = node_layer continue nodes = nodes | node_layer glider = Glider(nodes, **(glider_kwargs or {})) glider.consume(data, **(consume_kwargs or {}))
def run(self, data, func, queue=None, queue_name="default", redis_conn=None, push_type=PushTypes.Async, poll_sleep=POLL_SLEEP, timeout=None, **kwargs): """Execute func on data using Redis Queue Parameters ---------- data Data to process func : callable Function to execute using Redis Queue queue : Queue, optional An rq Queue object queue_name : str, optional When creating a queue, the name of the queue to use redis_conn : type, optional When creating a queue, the redis connection to use push_type : type, optional If "async", push the Job immediately. If "input", push the input data immediately after task submission. If "result", collect the task result synchronously and push it. poll_sleep : int or float, optional If waiting for the result, sleep this many seconds between polls timeout : int or float, optional If waiting for result, raise an exception if polling for all results takes longer than timeout seconds. **kwargs Keyword arguments to pass to enqueue() """ raiseifnot(queue or redis_conn, "One of 'queue' or 'redis_conn' must be specified") if not queue: queue = Queue(queue_name, connection=redis_conn) job = queue.enqueue(func, args=(data, ), **kwargs) if push_type == PushTypes.Async: self.push(job) elif push_type == PushTypes.Input: self.push(data) elif push_type == PushTypes.Result: self.push(get_async_result(job, timeout=timeout)) else: raise AssertionError("Invalid push_type: %s" % push_type)
def run(self, f, **kwargs): """Extract data for input file and push as a DataFrame Parameters ---------- f file or buffer to be passed to pandas.read_html **kwargs kwargs to be passed to pandas.read_html """ raiseifnot(pd, "Please install Pandas to use this class") df_list = pd.read_html(f, **kwargs) self.push(df_list)
def run(self, f, **kwargs): """Extract data for input file and push as a DataFrame Parameters ---------- f file or buffer to be passed to pandas.read_csv **kwargs kwargs to be passed to pandas.read_csv """ raiseifnot(pd, "Please install Pandas to use this class") df = pd.read_csv(f, **kwargs) self.do_push(df, chunksize=kwargs.get("chunksize", None))
def get_bulk_statement(self, conn, stmt_type, table, rows, odku=False): """Get a bulk execution SQL statement Parameters ---------- conn A SQL database connection object stmt_type : str Type of SQL statement to use (REPLACE, INSERT, etc.) table : str name of a SQL table rows An iterable of dict rows. The first row is used to determine column names. odku : bool or list, optional If true, add ON DUPLICATE KEY UPDATE clause for all columns. If a list then only add it for the specified columns. **Note:** Backend support for this varies. Returns ------- A SQL bulk load query of the given stmt_type """ if is_sqlalchemy_conn(conn): return get_bulk_statement(stmt_type, table, rows[0].keys(), dicts=False, odku=odku) if isinstance(conn, sqlite3.Connection): raiseifnot(isinstance(rows[0], sqlite3.Row), "Only sqlite3.Row rows are supported") return get_bulk_statement( stmt_type, table, rows[0].keys(), dicts=False, value_string="?", odku=odku, ) raiseif( isinstance(rows[0], tuple), "Dict rows expected, got tuple. Please use a dict cursor.", ) return get_bulk_statement(stmt_type, table, rows[0].keys(), odku=odku)
def run(self, df, method, **kwargs): """Helper to execute any pandas DataFrame method Parameters ---------- df : pandas.DataFrame DataFrame object used to run the method method : str A name of a valid DataFrame method **kwargs Arguments to pass to the DataFrame method """ m = getattr(df, method, None) raiseifnot(m and callable(m), "Invalid DataFrame method: %s" % m) self.push(m(**kwargs))
def run(self, df, func, **kwargs): """Use applymap() on a DataFrame Parameters ---------- df : pandas.DataFrame The pandas DataFrame to apply func to func : callable A callable that will be passed to df.applymap **kwargs Keyword arguments passed to applymap """ raiseifnot(pd, "Please install Pandas to use this class") df = df.applymap(func, **kwargs) self.push(df)
def run(self, sql, conn, **kwargs): """Extract data for input query and push as a DataFrame Parameters ---------- sql SQL query to pass to pandas.read_sql conn A SQL database connection **kwargs kwargs to be passed to pandas.read_sql """ raiseifnot(pd, "Please install Pandas to use this class") df = pd.read_sql(sql, conn, **kwargs) self.do_push(df, kwargs.get("chunksize", None))
def run(self, f, **kwargs): """Extract data for input file and push as a DataFrame. This will push a DataFrame or dict of DataFrames in the case of reading multiple sheets from an Excel file. Parameters ---------- f file or buffer to be passed to pandas.read_excel **kwargs kwargs to be passed to pandas.read_excel """ raiseifnot(pd, "Please install Pandas to use this class") df_or_dict = pd.read_excel(f, **kwargs) self.do_push(df_or_dict)
def run(self, data, func): """Assert that a function returns a truthy value Parameters ---------- data Data to push if func(self, data) is truthy func : callable A callable that accepts (node, data) args and returns a truthy value if the assertion should pass. """ raiseifnot( func(self, data), "Data assertion failed\nnode: %s\nfunc: %s" % (self.name, func), ) self.push(data)
def _get_arg_node_name(self, arg): names = [] node_lookup = self.glider.get_node_lookup() for node in node_lookup.values(): if not arg.startswith(node.name + "_"): continue arg_base_name = arg[len(node.name) + 1:] if arg_base_name in node.run_args or arg_base_name in node.run_kwargs: names.append(node.name) raiseifnot( len(names) <= 1, "More than one node found for arg name %s: %s" % (arg, names), ) if not names: return None return names[0]
def run(self, table, conn, where=None, limit=None, **kwargs): """Extract data for input table and push as a DataFrame Parameters ---------- table : str SQL table to query conn A SQL database connection where : str, optional A SQL where clause limit : int, optional Limit to put in SQL limit clause **kwargs kwargs to be passed to pandas.read_sql """ raiseifnot(pd, "Please install Pandas to use this class") sql = build_table_select(table, where=where, limit=limit) df = pd.read_sql(sql, conn, **kwargs) self.do_push(df, kwargs.get("chunksize", None))
def _push(self, data): raiseifnot(delayed, "Please install dask (delayed) to use DaskDelayedPush") if self._logging == "output": self._write_log(data) raiseif( "executor_kwargs" in self.context, "%s does not currently support executor_kwargs" % self.__class__, ) lazy = [] if self.context.get("split", False): splits = np.array_split(data, len(self._downstream_nodes)) for i, downstream in enumerate(self._downstream_nodes): lazy.append(delayed(downstream._process)(splits[i])) else: for downstream in self._downstream_nodes: lazy.append(delayed(downstream._process)(data)) compute(lazy)