Exemple #1
0
 def check_conn(self, conn):
     """Check the database connection"""
     raiseifnot(
         self._is_allowed_conn(conn),
         "Connection type %s is not in allowed types: %s" %
         (type(conn), self.allowed_conn_types),
     )
Exemple #2
0
    def commit(self, obj):
        """Commit any currently active transactions"""

        if hasattr(obj, "commit"):
            obj.commit()
        elif is_sqlalchemy_conn(obj):
            # Hack: I don't want to have to pass around the transaction
            # between nodes since that requirement is really specific to
            # SQLAlchemy, and SQLAlchemy doesn't seem to provide a standard
            # way of obtaining the current transaction, so this approach is
            # lifted from the SQLAlchemy internals.
            raiseifnot(
                hasattr(obj, "_Connection__transaction"),
                "Could not find transaction attribute on SQLAlchemy object: %s"
                % obj,
            )
            if getattr(obj, "_Connection__transaction", None):
                obj._Connection__transaction.commit()
            else:
                # SQLAlchemy connections autocommit by default, so we assume
                # that happened.
                pass
        else:
            raise AssertionError(
                "Could not determine how to commit with object: %s" % obj)
Exemple #3
0
    def create_like(self, conn, cursor, table, like_table, drop=False):
        """Create a table like another table, optionally trying to drop
        `table` first"""
        table = escape_string(str(table).strip("`"))
        like_table = escape_string(str(like_table).strip("`"))

        if drop:
            drop_sql = "drop table if exists %s" % table
            dbg(drop_sql)
            self.execute(conn, cursor, drop_sql)

        if isinstance(conn, sqlite3.Connection):
            get_create_sql = (
                "SELECT sql FROM sqlite_master WHERE type='table' AND name=?")
            qr = self.execute(conn,
                              cursor,
                              get_create_sql,
                              params=(like_table, ))
            row = qr.fetchone()
            raiseifnot(isinstance(row, sqlite3.Row),
                       "Only sqlite3.Row rows are supported")
            create_sql = row["sql"].replace(like_table, table)
        else:
            # Assume this syntax works with most other SQL databases
            create_sql = "create table %s like %s" % (table, like_table)

        dbg(create_sql)
        self.execute(conn, cursor, create_sql)
Exemple #4
0
    def run(self, df, func, from_pandas_kwargs=None, **kwargs):
        """Convert to dask dataframe and use apply()

        NOTE: it may be more efficient to not convert to/from Dask Dataframe
        in this manner depending on the pipeline

        Parameters
        ----------
        df : pandas.DataFrame
            The pandas DataFrame to apply func to
        func : callable
            A callable that will be passed to Dask DataFrame.apply
        from_pandas_kwargs : optional
            Keyword arguments to pass to dask.dataframe.from_pandas
        **kwargs
            Keyword arguments passed to Dask DataFrame.apply

        """
        raiseifnot(from_pandas,
                   "The dask (dataframe) package is not installed")
        from_pandas_kwargs = from_pandas_kwargs or {}
        set_missing_key(from_pandas_kwargs, "chunksize", 500)
        set_missing_key(from_pandas_kwargs, "sort", False)
        ddf = from_pandas(df, **from_pandas_kwargs)
        for column in ddf.columns:
            ddf[column] = ddf[column].apply(func,
                                            meta=(column, ddf[column].dtype),
                                            **kwargs)
        df = ddf.compute()
        self.push(df)
Exemple #5
0
    def run(self, df, func, threads=False, **kwargs):
        """Use Swifter apply() on a DataFrame

        Parameters
        ----------
        df : pandas.DataFrame
            The pandas DataFrame to apply func to
        func : callable
            A callable that will be passed to df.swifter.apply
        threads : bool
            If true use the "threads" scheduler, else "processes"
        **kwargs
            Keyword arguments passed to Dask df.swifter.apply

        """
        raiseifnot(swifter, "The swifter package is not installed")

        if threads:
            df.swifter.set_dask_scheduler(scheduler="threads")
        else:
            df.swifter.set_dask_scheduler(scheduler="processes")

        for column in df.columns:
            df[column] = df[column].swifter.apply(func, **kwargs)
        self.push(df)
Exemple #6
0
 def update_context(self, context):
     """Update the context dict for this Node"""
     raiseifnot(
         isinstance(context, dict),
         "Context must be dict-like, got %s" % type(context),
     )
     self.context.update(context)
Exemple #7
0
    def run(self, df_or_dict, f, push_file=False, dry_run=False, **kwargs):
        """Use Pandas to_excel to output a DataFrame

        Parameters
        ----------
        df_or_dict
            DataFrame or dict of DataFrames to load to an Excel file. In the
            case of a dict the keys will be the sheet names.
        f : file or buffer
            File to write the DataFrame to
        push_file : bool, optional
            If true, push the file forward instead of the data
        dry_run : bool, optional
            If true, skip actually loading the data
        **kwargs
            Keyword arguments passed to DataFrame.to_excel

        """
        raiseifnot(pd, "Please install Pandas to use this class")

        if dry_run:
            warn("dry_run=True, skipping load in %s.run" % self.__class__.__name__)
        else:
            if isinstance(df_or_dict, dict):
                with pd.ExcelWriter(f) as writer:
                    for sheet_name, df in df_or_dict.items():
                        df.to_excel(writer, sheet_name=sheet_name)
            else:
                df_or_dict.to_excel(f, **kwargs)

        if push_file:
            self.push(f)
        else:
            self.push(df_or_dict)
Exemple #8
0
    def run(self, df, conn, table, push_table=False, dry_run=False, **kwargs):
        """Use Pandas to_sql to output a DataFrame

        Parameters
        ----------
        df : pandas.DataFrame
            DataFrame to load to a SQL table
        conn
            Database connection
        table : str
            Name of a table to write the data to
        push_table : bool, optional
            If true, push the table forward instead of the data
        dry_run : bool, optional
            If true, skip actually loading the data
        **kwargs
            Keyword arguments passed to DataFrame.to_sql

        """
        raiseifnot(pd, "Please install Pandas to use this class")

        if dry_run:
            warn("dry_run=True, skipping load in %s.run" % self.__class__.__name__)
        else:
            df.to_sql(table, conn, **kwargs)

        if push_table:
            self.push(table)
        else:
            self.push(df)
Exemple #9
0
    def run(self, df, conn, schema=None, dry_run=False, **kwargs):
        """Use Pandas to_sql to output a DataFrame to a temporary table. Push a
        reference to the temp table forward.

        Parameters
        ----------
        df : pandas.DataFrame
            DataFrame to load to a SQL table
        conn
            Database connection
        schema : str, optional
            schema to create the temp table in
        dry_run : bool, optional
            If true, skip actually loading the data
        **kwargs
            Keyword arguments passed to DataFrame.to_sql

        """
        raiseifnot(pd, "Please install Pandas to use this class")

        raiseif(
            isinstance(conn, sqlite3.Connection),
            "sqlite3 connections not supported due to bug in Pandas' has_table()",
        )

        table = get_temp_table(conn, df, schema=schema, create=True)
        if dry_run:
            warn("dry_run=True, skipping load in %s.run" % self.__class__.__name__)
        else:
            df.to_sql(table.name, conn, if_exists="append", **kwargs)

        self.push(table.name)
Exemple #10
0
    def run(self, df, f, push_file=False, dry_run=False, **kwargs):
        """Use Pandas to_html to output a DataFrame

        Parameters
        ----------
        df : pandas.DataFrame
            DataFrame to load to an HTML file
        f : file or buffer
            File to write the DataFrame to
        push_file : bool, optional
            If true, push the file forward instead of the data
        dry_run : bool, optional
            If true, skip actually loading the data
        **kwargs
            Keyword arguments passed to DataFrame.to_html

        """
        raiseifnot(pd, "Please install Pandas to use this class")

        if dry_run:
            warn("dry_run=True, skipping load in %s.run" % self.__class__.__name__)
        else:
            df.to_html(f, **kwargs)

        if push_file:
            self.push(f)
        else:
            self.push(df)
Exemple #11
0
 def check_conn(self, conn):
     """Make sure the object is a valid SQL connection"""
     raiseifnot(
         hasattr(conn, "cursor") or is_sqlalchemy_conn(conn),
         "Connection must have a cursor() method or be a SQLAlchemy connection: %s"
         % conn,
     )
Exemple #12
0
    def run(self, data, drop=None, **transforms):
        """Rename/replace keys in an iterable of dicts

        Parameters
        ----------
        data
            Data to process. Expected to be a list/iterable of dict rows.
        drop : list, optional
            A list of keys to drop after transformations are complete.
        **transforms
            key->value pairs used to populate columns of each dict row. If the
            value is a callable it is expected to take the row as input and
            return the value to fill in for the key.

        """
        drop = drop or []
        raiseifnot(
            isinstance(drop, (list, tuple)),
            "drop argument must be a list/tuple of keys to drop",
        )

        for row in data:
            raiseifnot(isinstance(row, dict),
                       "Dict rows expected, got %s" % type(row))
            for key, value in transforms.items():
                if callable(value):
                    row[key] = value(row)
                else:
                    row[key] = value

            for key in drop:
                del row[key]

        self.push(data)
Exemple #13
0
 def assert_arg_present(custom_arg, arg_name):
     raiseifnot(
         arg_name in script_args,
         ("Custom arg %s with dest=%s maps to node arg=%s "
          "which is not in the script arg list. Check for "
          "conflicting args that cover the same node arg." %
          (custom_arg.name, custom_arg.dest, arg_name)),
     )
Exemple #14
0
 def __init__(self, queue, *args, **kwargs):
     raiseifnot(Queue, "Please install 'rq' to use RQParaGlider")
     raiseifnot(
         isinstance(queue, Queue),
         "The first argument to RQParaGlider must be a Queue",
     )
     self.queue = queue
     super().__init__(*args, **kwargs)
Exemple #15
0
 def __init__(self, *args, **kwargs):
     raiseifnot(
         self.allowed_conn_types and isinstance(self.allowed_conn_types,
                                                (list, tuple)),
         ("%s.allowed_conn_types must be a list or tuple of connection types"
          % self.__class__.__name__),
     )
     super().__init__(*args, **kwargs)
Exemple #16
0
 def __init__(self, consume_task, *args, **kwargs):
     raiseifnot(Celery, "Please install Celery to use CeleryParaGlider")
     raiseifnot(
         isinstance(consume_task, Task),
         ("The first argument to CeleryParaGlider must be a registered "
          "celery task that mirrors consume()"),
     )
     self.consume_task = consume_task
     super().__init__(*args, **kwargs)
Exemple #17
0
 def begin(self):
     conn = self.context.get("conn", None) or self.global_state.get(
         "conn", None)
     raiseifnot(
         conn,
         "%s requires a conn argument in context or global state" %
         self.__class__.__name__,
     )
     self.check_conn(conn)
Exemple #18
0
        def run(self,
                data,
                node_init,
                glider_kwargs=None,
                consume_kwargs=None):
            """Build a Glider from the input parameters and call consume

            Note
            ----
            Currently node_init is limited in the types of DAGs it can
            represent. Grouping nodes as sublists to form a layer is
            supported.

            Parameters
            ----------
            data
                data to process with the built glider
            node_init
                An iterable of nodes, where nodes are represented as
                JSON-serializable dicts and optionally grouped as layers with
                sublists.
            glider_kwargs : dict, optional
                Keyword arguments to pass to the constructed Glider.
            consume_kwargs : type, optional
                Keyword arguments to pass to consume()

            """
            nodes = None

            for node_init_layer in node_init:
                if not isinstance(node_init_layer, (list, tuple)):
                    node_init_layer = [node_init_layer]

                node_layer = []
                for node_info in node_init_layer:
                    # TODO: this only supports a single level of node tree evaluation
                    raiseifnot(isinstance(node_info, dict),
                               "Node info must be in dict format")
                    node_args = node_info["args"]
                    node_kwargs = node_info.get("kwargs", {})
                    cls = import_object(node_info["class_name"])
                    node = cls(*node_args, **node_kwargs)
                    node_layer.append(node)

                if len(node_layer) == 1:
                    node_layer = node_layer[0]

                if nodes is None:
                    nodes = node_layer
                    continue
                nodes = nodes | node_layer

            glider = Glider(nodes, **(glider_kwargs or {}))
            glider.consume(data, **(consume_kwargs or {}))
Exemple #19
0
    def run(self,
            data,
            func,
            queue=None,
            queue_name="default",
            redis_conn=None,
            push_type=PushTypes.Async,
            poll_sleep=POLL_SLEEP,
            timeout=None,
            **kwargs):
        """Execute func on data using Redis Queue

        Parameters
        ----------
        data
            Data to process
        func : callable
            Function to execute using Redis Queue
        queue : Queue, optional
            An rq Queue object
        queue_name : str, optional
            When creating a queue, the name of the queue to use
        redis_conn : type, optional
            When creating a queue, the redis connection to use
        push_type : type, optional
            If "async", push the Job immediately.
            If "input", push the input data immediately after task submission.
            If "result", collect the task result synchronously and push it.
        poll_sleep : int or float, optional
            If waiting for the result, sleep this many seconds between polls
        timeout : int or float, optional
            If waiting for result, raise an exception if polling for all
            results takes longer than timeout seconds.
        **kwargs
            Keyword arguments to pass to enqueue()

        """
        raiseifnot(queue or redis_conn,
                   "One of 'queue' or 'redis_conn' must be specified")
        if not queue:
            queue = Queue(queue_name, connection=redis_conn)

        job = queue.enqueue(func, args=(data, ), **kwargs)

        if push_type == PushTypes.Async:
            self.push(job)
        elif push_type == PushTypes.Input:
            self.push(data)
        elif push_type == PushTypes.Result:
            self.push(get_async_result(job, timeout=timeout))
        else:
            raise AssertionError("Invalid push_type: %s" % push_type)
Exemple #20
0
    def run(self, f, **kwargs):
        """Extract data for input file and push as a DataFrame

        Parameters
        ----------
        f
            file or buffer to be passed to pandas.read_html
        **kwargs
            kwargs to be passed to pandas.read_html

        """
        raiseifnot(pd, "Please install Pandas to use this class")
        df_list = pd.read_html(f, **kwargs)
        self.push(df_list)
Exemple #21
0
    def run(self, f, **kwargs):
        """Extract data for input file and push as a DataFrame

        Parameters
        ----------
        f
            file or buffer to be passed to pandas.read_csv
        **kwargs
            kwargs to be passed to pandas.read_csv

        """
        raiseifnot(pd, "Please install Pandas to use this class")
        df = pd.read_csv(f, **kwargs)
        self.do_push(df, chunksize=kwargs.get("chunksize", None))
Exemple #22
0
    def get_bulk_statement(self, conn, stmt_type, table, rows, odku=False):
        """Get a bulk execution SQL statement

        Parameters
        ----------
        conn
            A SQL database connection object
        stmt_type : str
            Type of SQL statement to use (REPLACE, INSERT, etc.)
        table : str
            name of a SQL table
        rows
            An iterable of dict rows. The first row is used to determine
            column names.
        odku : bool or list, optional
            If true, add ON DUPLICATE KEY UPDATE clause for all columns. If a
            list then only add it for the specified columns. **Note:** Backend
            support for this varies.

        Returns
        -------
        A SQL bulk load query of the given stmt_type

        """
        if is_sqlalchemy_conn(conn):
            return get_bulk_statement(stmt_type,
                                      table,
                                      rows[0].keys(),
                                      dicts=False,
                                      odku=odku)

        if isinstance(conn, sqlite3.Connection):
            raiseifnot(isinstance(rows[0], sqlite3.Row),
                       "Only sqlite3.Row rows are supported")
            return get_bulk_statement(
                stmt_type,
                table,
                rows[0].keys(),
                dicts=False,
                value_string="?",
                odku=odku,
            )

        raiseif(
            isinstance(rows[0], tuple),
            "Dict rows expected, got tuple. Please use a dict cursor.",
        )
        return get_bulk_statement(stmt_type, table, rows[0].keys(), odku=odku)
Exemple #23
0
    def run(self, df, method, **kwargs):
        """Helper to execute any pandas DataFrame method

        Parameters
        ----------
        df : pandas.DataFrame
            DataFrame object used to run the method
        method : str
            A name of a valid DataFrame method
        **kwargs
            Arguments to pass to the DataFrame method

        """
        m = getattr(df, method, None)
        raiseifnot(m and callable(m), "Invalid DataFrame method: %s" % m)
        self.push(m(**kwargs))
Exemple #24
0
    def run(self, df, func, **kwargs):
        """Use applymap() on a DataFrame

        Parameters
        ----------
        df : pandas.DataFrame
            The pandas DataFrame to apply func to
        func : callable
            A callable that will be passed to df.applymap
        **kwargs
            Keyword arguments passed to applymap

        """
        raiseifnot(pd, "Please install Pandas to use this class")
        df = df.applymap(func, **kwargs)
        self.push(df)
Exemple #25
0
    def run(self, sql, conn, **kwargs):
        """Extract data for input query and push as a DataFrame

        Parameters
        ----------
        sql
            SQL query to pass to pandas.read_sql
        conn
            A SQL database connection
        **kwargs
            kwargs to be passed to pandas.read_sql

        """
        raiseifnot(pd, "Please install Pandas to use this class")
        df = pd.read_sql(sql, conn, **kwargs)
        self.do_push(df, kwargs.get("chunksize", None))
Exemple #26
0
    def run(self, f, **kwargs):
        """Extract data for input file and push as a DataFrame. This will push a
        DataFrame or dict of DataFrames in the case of reading multiple sheets
        from an Excel file.

        Parameters
        ----------
        f
            file or buffer to be passed to pandas.read_excel
        **kwargs
            kwargs to be passed to pandas.read_excel

        """
        raiseifnot(pd, "Please install Pandas to use this class")
        df_or_dict = pd.read_excel(f, **kwargs)
        self.do_push(df_or_dict)
Exemple #27
0
    def run(self, data, func):
        """Assert that a function returns a truthy value

        Parameters
        ----------
        data
            Data to push if func(self, data) is truthy
        func : callable
            A callable that accepts (node, data) args and returns a truthy value if
            the assertion should pass.

        """
        raiseifnot(
            func(self, data),
            "Data assertion failed\nnode: %s\nfunc: %s" % (self.name, func),
        )
        self.push(data)
Exemple #28
0
    def _get_arg_node_name(self, arg):
        names = []
        node_lookup = self.glider.get_node_lookup()
        for node in node_lookup.values():
            if not arg.startswith(node.name + "_"):
                continue

            arg_base_name = arg[len(node.name) + 1:]
            if arg_base_name in node.run_args or arg_base_name in node.run_kwargs:
                names.append(node.name)

        raiseifnot(
            len(names) <= 1,
            "More than one node found for arg name %s: %s" % (arg, names),
        )
        if not names:
            return None
        return names[0]
Exemple #29
0
    def run(self, table, conn, where=None, limit=None, **kwargs):
        """Extract data for input table and push as a DataFrame

        Parameters
        ----------
        table : str
            SQL table to query
        conn
            A SQL database connection
        where : str, optional
            A SQL where clause
        limit : int, optional
            Limit to put in SQL limit clause
        **kwargs
            kwargs to be passed to pandas.read_sql

        """
        raiseifnot(pd, "Please install Pandas to use this class")
        sql = build_table_select(table, where=where, limit=limit)
        df = pd.read_sql(sql, conn, **kwargs)
        self.do_push(df, kwargs.get("chunksize", None))
Exemple #30
0
    def _push(self, data):
        raiseifnot(delayed,
                   "Please install dask (delayed) to use DaskDelayedPush")

        if self._logging == "output":
            self._write_log(data)

        raiseif(
            "executor_kwargs" in self.context,
            "%s does not currently support executor_kwargs" % self.__class__,
        )

        lazy = []
        if self.context.get("split", False):
            splits = np.array_split(data, len(self._downstream_nodes))
            for i, downstream in enumerate(self._downstream_nodes):
                lazy.append(delayed(downstream._process)(splits[i]))
        else:
            for downstream in self._downstream_nodes:
                lazy.append(delayed(downstream._process)(data))

        compute(lazy)