Example #1
    def end(self):
        """Do the push once all Futures results are in"""
        dbg("Waiting for %d async futures..." % len(self.results))
        timeout = self.context.get("timeout", None)
        close = self.context.get("close", None)

        loop = asyncio.get_event_loop()

        try:
            done, pending = loop.run_until_complete(
                asyncio.wait(self.results, timeout=timeout))
            if timeout and pending:
                cancel_asyncio_tasks(pending,
                                     loop,
                                     cancel_timeout=ASYNCIO_CANCEL_TIMEOUT)
                raise asyncio.TimeoutError("%d/%d tasks pending" %
                                           (len(pending), len(self.results)))
            results = [task.result() for task in done]
        finally:
            if close:
                loop.close()

        if results and self.context.get("flatten", False):
            results = flatten(results)
        self.push(results)
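For reference, here is a minimal standalone sketch of the same waiting pattern using only the standard library: run asyncio tasks with a timeout, cancel whatever is still pending, and collect the results. The coroutine name and timeout value are illustrative; the helpers used above (dbg, cancel_asyncio_tasks, flatten, self.push) are not reproduced.

import asyncio

async def work(i):
    await asyncio.sleep(i)
    return i

async def main(timeout=1.5):
    tasks = [asyncio.ensure_future(work(i)) for i in range(4)]
    # Wait up to `timeout` seconds for all tasks to finish
    done, pending = await asyncio.wait(tasks, timeout=timeout)
    for task in pending:
        task.cancel()  # cancel stragglers, mirroring cancel_asyncio_tasks above
    if pending:
        raise asyncio.TimeoutError("%d/%d tasks pending" % (len(pending), len(tasks)))
    return [task.result() for task in done]

# results = asyncio.run(main())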
Example #2
 def end(self):
     """Do the push once all results are in"""
     dbg("Waiting for %d RQ job(s)..." % len(self.results))
     results = get_async_results(self.results)
     if results and self.context.get("flatten", False):
         results = flatten(results)
     self.push(results)
Example #3
    def consume(self,
                data=None,
                cleanup=None,
                split_count=None,
                synchronous=False,
                timeout=None,
                **node_contexts):
        """Setup node contexts and consume data with the pipeline

        Parameters
        ----------
        data : iterable, optional
            Iterable of data to consume
        cleanup : dict, optional
            A mapping of arg names to clean up functions to be run after
            data processing is complete.
        split_count : int, optional
            How many slices to split the data into for parallel processing. Default
            is to inspect the celery app and set split_count = worker count.
        synchronous : bool, optional
            If False, return AsyncResults. If True, wait for tasks to complete and
            return their results, if any.
        timeout : int or float, optional
            If waiting for results, pass this as timeout to AsyncResult.get().
        **node_contexts
            Keyword arguments that are node_name->param_dict

        """
        if not split_count:
            dbg("determining split count from app celery worker count")
            app_stats = self.consume_task.app.control.inspect().stats()
            split_count = len(app_stats.keys())

        split_count = split_count_helper(data, split_count)
        if data is None:
            splits = [None for s in range(split_count)]
        else:
            splits = divide_data(data, split_count)

        dbg("%s: data len: %s, splits: %d" %
            (self.__class__.__name__, size(data, "n/a"), split_count))

        async_results = []
        for split in splits:
            async_results.append(
                self.consume_task.delay(self.pipeline,
                                        split,
                                        cleanup=cleanup,
                                        **node_contexts))

        if synchronous:
            results = []
            for async_result in async_results:
                try:
                    results.append(async_result.get(timeout=timeout))
                finally:
                    async_result.forget()
            return results

        return async_results
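A rough sketch of the underlying Celery calls, with a hypothetical add task and demo app (not the pipeline machinery above): inspect worker stats to pick a split count, fan the work out with .delay(), then collect each result with AsyncResult.get() and release it with forget().

from celery import Celery

app = Celery("demo", broker="redis://localhost:6379/0", backend="redis://localhost:6379/0")

@app.task
def add(x, y):
    return x + y

def fan_out(pairs, timeout=None):
    # Worker count can drive the split size, as in the split_count default above
    stats = app.control.inspect().stats() or {}
    print("worker count:", len(stats))
    async_results = [add.delay(x, y) for x, y in pairs]
    results = []
    for ar in async_results:
        try:
            results.append(ar.get(timeout=timeout))
        finally:
            ar.forget()  # drop the result from the backend once consumed
    return results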
Example #4
    def create_like(self, conn, cursor, table, like_table, drop=False):
        """Create a table like another table, optionally trying to drop
        `table` first"""
        table = escape_string(str(table).strip("`"))
        like_table = escape_string(str(like_table).strip("`"))

        if drop:
            drop_sql = "drop table if exists %s" % table
            dbg(drop_sql)
            self.execute(conn, cursor, drop_sql)

        if isinstance(conn, sqlite3.Connection):
            get_create_sql = (
                "SELECT sql FROM sqlite_master WHERE type='table' AND name=?")
            qr = self.execute(conn,
                              cursor,
                              get_create_sql,
                              params=(like_table, ))
            row = qr.fetchone()
            raiseifnot(isinstance(row, sqlite3.Row),
                       "Only sqlite3.Row rows are supported")
            create_sql = row["sql"].replace(like_table, table)
        else:
            # Assume this syntax works with most other SQL databases
            create_sql = "create table %s like %s" % (table, like_table)

        dbg(create_sql)
        self.execute(conn, cursor, create_sql)
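The sqlite3 branch can be illustrated on its own with the standard library; the src/dst table names are made up for this sketch.

import sqlite3

conn = sqlite3.connect(":memory:")
conn.row_factory = sqlite3.Row
conn.execute("create table src (id integer primary key, name text)")

# Read the original CREATE statement and rewrite it for the new table name,
# mirroring the sqlite3 branch of create_like above
row = conn.execute(
    "SELECT sql FROM sqlite_master WHERE type='table' AND name=?", ("src",)
).fetchone()
create_sql = row["sql"].replace("src", "dst")
conn.execute("drop table if exists dst")
conn.execute(create_sql)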
Example #5
    def consume(self,
                data=None,
                cleanup=None,
                split_count=None,
                synchronous=False,
                timeout=None,
                **node_contexts):
        """Setup node contexts and consume data with the pipeline

        Parameters
        ----------
        data : iterable, optional
            Iterable of data to consume
        cleanup : dict, optional
            A mapping of arg names to clean up functions to be run after
            data processing is complete.
        split_count : int, optional
            How many slices to split the data into for parallel processing. Default
            is the number of workers in the provided queue.
        synchronous : bool, optional
            If False, return Jobs. If True, wait for jobs to complete and
            return their results, if any.
        timeout : int or float, optional
            If waiting for results, raise an exception if polling for all
            results takes longer than timeout seconds.
        **node_contexts
            Keyword arguments that are node_name->param_dict

        """
        if not split_count:
            dbg("determining split count from rq worker count")
            workers = Worker.all(queue=self.queue)
            split_count = len(workers)

        split_count = split_count_helper(data, split_count)
        if data is None:
            splits = [None for s in range(split_count)]
        else:
            splits = divide_data(data, split_count)

        dbg("%s: data len: %s, splits: %d" %
            (self.__class__.__name__, size(data, "n/a"), split_count))

        async_results = []
        for split in splits:
            async_results.append(
                self.queue.enqueue(
                    rq_consume,
                    args=(self.pipeline, split),
                    kwargs=dict(cleanup=cleanup, **node_contexts),
                ))

        if synchronous:
            return get_async_results(async_results, timeout=timeout)

        return async_results
Example #6
 def end(self):
     """Do the push once all Futures results are in"""
     dbg("Waiting for %d futures..." % len(self.results))
     timeout = self.context.get("timeout", None)
     results = []
     for future in as_completed(self.results, timeout=timeout):
         results.append(future.result())
     if results and self.context.get("flatten", False):
         results = flatten(results)
     self.push(results)
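The same pattern in isolation with concurrent.futures; the thread pool and square function are throwaway illustrations.

from concurrent.futures import ThreadPoolExecutor, as_completed

def square(x):
    return x * x

with ThreadPoolExecutor(max_workers=4) as executor:
    futures = [executor.submit(square, i) for i in range(10)]
    # as_completed yields futures as they finish and raises TimeoutError
    # if everything has not completed within `timeout` seconds
    results = [f.result() for f in as_completed(futures, timeout=30)]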
Example #7
    def run(
        self,
        rows,
        conn,
        cursor=None,
        schema=None,
        commit=True,
        rollback=False,
        dry_run=False,
    ):
        """Create and bulk load a temp table

        Parameters
        ----------
        rows
            Iterable of rows to load to the table
        conn
            Database connection
        cursor : optional
            Database connection cursor
        schema : str, optional
            Schema to create temp table in
        commit : bool, optional
            If true try to commit the transaction. If your connection
            autocommits this will have no effect. If this is a SQLAlchemy
            connection and you are in a transaction, it will try to get a
            reference to the current transaction and call commit on that.
        rollback : bool, optional
            If true try to rollback the transaction on exceptions. Behavior
            may vary by backend DB library if you are not currently in a
            transaction.
        dry_run : bool, optional
            If true, skip actually loading the data

        """
        table = get_temp_table(conn, rows, create=True, schema=schema)
        sql = self.get_bulk_statement(conn, "REPLACE", table.name, rows)
        dbg("Loading %d rows\n%s" % (size(rows, "n/a"), sqlformat(sql)), indent="label")

        if dry_run:
            warn("dry_run=True, skipping load in %s.run" % self.__class__.__name__)
        else:
            if not cursor:
                cursor = self.get_sql_executor(conn)

            try:
                self.executemany(conn, cursor, sql, rows)
                if commit:
                    self.commit(conn)
            except:
                if rollback:
                    self.rollback(conn)
                raise

        self.push(table.name)
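A self-contained sqlite3 sketch of the executemany/commit/rollback flow, with a hand-written temp table in place of get_temp_table and a plain INSERT standing in for the generated bulk statement.

import sqlite3

rows = [(1, "a"), (2, "b"), (3, "c")]

conn = sqlite3.connect(":memory:")
cursor = conn.cursor()
cursor.execute("create temp table tmp_load (id integer, name text)")

# One placeholder per column; executemany binds each row in turn
sql = "insert into tmp_load (id, name) values (?, ?)"
try:
    cursor.executemany(sql, rows)
    conn.commit()
except Exception:
    conn.rollback()
    raise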
Example #8
 def process(self, data):
     """Required method used by Consecution to process nodes"""
     arg_values, kwarg_values = self._get_run_arg_values()
     if self._log:
         print(format_msg(repr(data), label=self.name))
     else:
         dbg("size:%s %s" % (size(data), repr(data)), label=self.name)
     if self.run_requires_data:
         self._run(data, *arg_values, **kwarg_values)
     else:
         self._run(*arg_values, **kwarg_values)
Example #9
 def end(self):
     """Do the push once all results are in"""
     dbg("Waiting for %d celery task(s)..." % len(self.results))
     result_set = ResultSet(self.results)
     results = result_set.get(
         timeout=self.context.get("timeout", None),
         propagate=self.context.get("propagate", True),
         interval=self.context.get("interval", 0.5),
     )
     result_set.forget()
     if results and self.context.get("flatten", False):
         results = flatten(results)
     self.push(results)
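For reference, the bare celery.result.ResultSet usage looks roughly like this; the demo app and add task are the same illustrative stand-ins as in the earlier Celery sketch.

from celery import Celery
from celery.result import ResultSet

app = Celery("demo", broker="redis://localhost:6379/0", backend="redis://localhost:6379/0")

@app.task
def add(x, y):
    return x + y

async_results = [add.delay(i, i) for i in range(4)]
result_set = ResultSet(async_results)
# propagate=True re-raises task exceptions; interval is the polling interval in seconds
results = result_set.get(timeout=30, propagate=True, interval=0.5)
result_set.forget()  # release the results from the backend once consumed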
Example #10
    def end(self):
        """Do the push once all Futures results are in.

        Warnings
        --------
        Dask futures will not work if you have closed your client connection!

        """
        dbg("Waiting for %d Dask futures..." % len(self.results))
        results = []
        for _, result in dask_as_completed(self.results, with_results=True):
            results.append(result)
        if results and self.context.get("flatten", False):
            results = pd.concat(results)
        self.push(results)
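A minimal dask.distributed sketch of the same collection loop; the in-process Client and square function are purely illustrative.

from dask.distributed import Client, as_completed

def square(x):
    return x * x

client = Client(processes=False)  # small in-process cluster for the sketch
futures = client.map(square, range(10))

# with_results=True yields (future, result) pairs as work finishes;
# per the warning above, the client must stay connected while collecting
results = [result for _, result in as_completed(futures, with_results=True)]
client.close()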
Example #11
def get_async_results(async_results, timeout=None):
    """Poll for results """
    # TODO: Is there a better option than polling?
    start = time.time()

    while complete_count(async_results) < len(async_results):
        diff = time.time() - start
        if timeout and diff >= timeout:
            raise RQTimeoutException(
                "get_async_results timed out after %.3fs" % diff)

        dbg("Sleeping %.3fs..." % POLL_SLEEP)
        time.sleep(POLL_SLEEP)

    return [job.result for job in async_results]
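A standalone version of this polling loop over RQ jobs might look as follows; POLL_SLEEP and the status check are stand-ins for the library helpers (complete_count, RQTimeoutException), and the commented-out enqueue lines only show where the jobs would come from.

import time
from redis import Redis
from rq import Queue

POLL_SLEEP = 1.0  # stand-in for the module-level constant used above

def wait_for_jobs(jobs, timeout=None):
    start = time.time()
    # Treat failed jobs as complete too, so a failure does not hang the loop
    while not all(job.get_status() in ("finished", "failed") for job in jobs):
        if timeout and time.time() - start >= timeout:
            raise TimeoutError("timed out waiting for %d job(s)" % len(jobs))
        time.sleep(POLL_SLEEP)
    return [job.result for job in jobs]

# queue = Queue(connection=Redis())
# jobs = [queue.enqueue(some_func, args=(x,)) for x in data]
# results = wait_for_jobs(jobs, timeout=300)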
Example #12
    def transaction(self, conn, cursor=None):
        """Start a transaction. If conn is a SQLAlchemy conn return a
        reference to the transaction object, otherwise just return the conn
        which should have commit/rollback methods."""

        dbg("starting transaction: %s" % conn)
        if is_sqlalchemy_conn(conn):
            return conn.begin()

        # For SQLite and DBAPI connections we explicitly call begin.
        # https://docs.python.org/3/library/sqlite3.html#sqlite3-controlling-transactions
        if not cursor:
            cursor = self.get_sql_executor(conn)
        cursor.execute("BEGIN")
        return conn
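The explicit BEGIN matters when sqlite3 runs in autocommit mode (isolation_level=None), which is the situation the linked docs describe. A small sketch:

import sqlite3

# isolation_level=None puts sqlite3 in autocommit mode, so transactions
# must be opened explicitly with BEGIN, as in the example above
conn = sqlite3.connect(":memory:", isolation_level=None)
cursor = conn.cursor()
cursor.execute("create table t (x integer)")

cursor.execute("BEGIN")
cursor.execute("insert into t values (1)")
conn.rollback()   # the explicit transaction can be rolled back...

cursor.execute("BEGIN")
cursor.execute("insert into t values (2)")
conn.commit()     # ...or committed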
Example #13
    def rollback(self, obj):
        """Rollback any currently active transactions"""

        dbg("rolling back transaction: %s" % obj)
        if hasattr(obj, "rollback"):
            obj.rollback()
        elif is_sqlalchemy_conn(obj):
            # See note above about this hack
            raiseifnot(
                hasattr(obj, "_Connection__transaction"),
                "Could not find transaction attribute on SQLAlchemy object: %s"
                % obj,
            )
            if getattr(obj, "_Connection__transaction", None):
                obj._Connection__transaction.rollback()
            else:
                raise AssertionError(
                    "Trying to rollback a transaction but the SQLAlchemy "
                    "conn was not in a transaction. It may have "
                    "autocommitted.")
        else:
            raise AssertionError(
                "Could not determine how to rollback with object: %s" % obj)
Example #14
def consume(pipeline, data, cleanup=None, **node_contexts):
    """Handles node contexts before/after calling pipeline.consume()

    Note
    ----
    It would have been better to subclass Pipeline and implement this logic
    right before/after the core consume() call, but there is a bug in pickle
    that prevents that from working with multiprocessing.

    """
    update_node_contexts(pipeline, node_contexts)
    try:
        contexts = get_node_contexts(pipeline)
        dbg("size=%s\n%s" % (size(data, "n/a"), pf(contexts)), indent="label")
        try:
            if data is None:
                return consume_none(pipeline)
            else:
                return pipeline.consume(iterize(data))
        finally:
            if cleanup:
                clean_up_nodes(cleanup, contexts)
    finally:
        reset_node_contexts(pipeline, node_contexts)
Example #15
def clean_up_nodes(cleanup, contexts):
    """Call clean up functions for node context objects"""
    errors = []
    cleaned = set()

    # This block will clean any arg names that match regardless of node name
    removes = set()
    for node_name, context in contexts.items():
        for arg_name, arg_value in context.items():
            if arg_name in cleanup:
                cleaned.add((node_name, arg_name))
                removes.add(arg_name)
                func = cleanup[arg_name]
                try:
                    func(arg_value)
                except Exception as e:
                    dbg("Exception during clean up: %s" % str(e))

    for key in removes:
        del cleanup[key]

    # This block handles specific node_name/arg_name pairs
    for key, func in cleanup.items():
        parts = key.split("_")
        node_name = parts[0]
        arg_name = "_".join(parts[1:])

        if node_name not in contexts:
            errors.append("Could not clean up %s, invalid node name: %s" %
                          (key, node_name))
            continue

        if arg_name not in contexts[node_name]:
            errors.append(
                "Could not clean up %s, invalid node arg name: %s->%s" %
                (key, node_name, arg_name))
            continue

        if (node_name, arg_name) in cleaned:
            dbg("Skipping clean up for %s->%s, already cleaned" %
                (node_name, arg_name))
            continue

        ctx_value = contexts[node_name][arg_name]
        if not ctx_value:
            dbg("Skipping clean up for %s->%s, value is blank" %
                (node_name, arg_name))
            continue

        if isinstance(ctx_value, RuntimeContext):
            dbg("Skipping clean up for %s->%s, value is RuntimeContext object"
                % (node_name, arg_name))
            continue

        dbg("Executing clean up for %s->%s" % (node_name, arg_name))
        try:
            func(ctx_value)
        except Exception as e:
            errors.append("Failed to clean up %s->%s: %s" %
                          (node_name, arg_name, str(e)))

    if errors:
        raise Exception("Errors during clean_up: %s" % errors)
Example #16
 def rename_tables(self, conn, cursor, renames):
     """Execute one or more table renames"""
     for t1, t2 in renames:
         sql = escape_string("ALTER TABLE %s RENAME TO %s" % (t1, t2))
         dbg(sql)
         self.execute(conn, cursor, sql)
Example #17
    def run(
        self,
        data,
        frm=None,
        to=None,
        subject=None,
        body=None,
        html=None,
        attach_as="attachment",
        attachment_name=None,
        formatter=None,
        client=None,
        host=None,
        port=None,
        username=None,
        password=None,
        dry_run=False,
    ):
        """Load data to email via SMTP.

        Parameters
        ----------
        data
            EmailMessage or data to send. If the latter, the message will be
            created from the other node arguments.
        frm : str, optional
            The from email address
        to : str or list, optional
            A str or list of destination email addresses
        subject : str, optional
            The email subject
        body : str, optional
            The email text body
        html : str, optional
            The email html body
        attach_as : str
            Where to put the data in the email message if building the message
            from node arguments. Options: attachment, body, html.
        attachment_name : str, optional
            The file name to write the data to when attaching data to the
            email. The file extension will be used to infer the mimetype of
            the attachment. This should not be a full path as a temp directory
            will be created for this.
        formatter : callable
            A function to format and return a string from the input data if
            attach_as is set to "body" or "html".
        client : optional
            A connected smtplib.SMTP client
        host : str, optional
            The SMTP host to connect to if no client is provided
        port : int, optional
            The SMTP port to connect to if no client is provided
        username : str, optional
            The SMTP username for login if no client is provided
        password : str, optional
            The SMTP password for login if no client is provided
        dry_run : bool, optional
            If true, skip actually loading the data

        """

        if isinstance(data, EmailMessage):
            msg = data
        else:
            # Assume it's data that needs to be converted to attachments and sent
            raiseifnot(
                frm and to and subject,
                "Node context must have frm/to/subject set to create an email msg",
            )
            raiseifnot(
                isinstance(data, str),
                "data must be passed as raw str content, got %s" % type(data),
            )

            attachments = None
            tmpdir = None

            if attach_as == "attachment":
                raiseifnot(
                    attachment_name,
                    "Must specify an attachment_name when attach_as = attachment",
                )
                tmpdir = tempfile.TemporaryDirectory()
                filename = tmpdir.name + "/" + attachment_name
                with open(filename, "w") as f:
                    f.write(data)
                attachments = [filename]
            else:
                fmt_data = formatter(data) if formatter else data
                if attach_as == "body":
                    body = (body or "") + fmt_data
                elif attach_as == "html":
                    html = (html or "") + fmt_data
                else:
                    raise AssertionError(
                        "Invalid attach_as value: %s, options: attachment, body, html"
                        % attach_as
                    )

            msg = create_email(
                frm, to, subject, body=body, html=html, attachments=attachments
            )

            if tmpdir:
                tmpdir.cleanup()

        if dry_run:
            warn("dry_run=True, skipping load in %s.run" % self.__class__.__name__)
        else:
            dbg("Sending msg %s to %s" % (msg["Subject"], msg["To"]))
            send_email(
                msg,
                client=client,
                host=host,
                port=port,
                username=username,
                password=password,
            )

        self.push(data)
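A standard-library sketch of building and sending such a message; the host, credentials, recipient, and attachment content are placeholders.

import mimetypes
import smtplib
from email.message import EmailMessage

msg = EmailMessage()
msg["From"] = "sender@example.com"
msg["To"] = "recipient@example.com"
msg["Subject"] = "Report"
msg.set_content("See attached report.")

# Infer the mimetype from the file extension, as described for attachment_name above
filename = "report.csv"
data = "a,b\n1,2\n"
ctype, _ = mimetypes.guess_type(filename)
maintype, subtype = (ctype or "application/octet-stream").split("/", 1)
msg.add_attachment(data.encode("utf8"), maintype=maintype, subtype=subtype, filename=filename)

with smtplib.SMTP("smtp.example.com", 587) as client:
    client.starttls()
    client.login("username", "password")
    client.send_message(msg)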
Example #18
 def drop_table(self, conn, cursor, table):
     """Drop tables all day long"""
     drop_sql = escape_string("drop table %s" % table)
     dbg(drop_sql)
     self.execute(conn, cursor, drop_sql)
Example #19
    def run(self,
            criteria,
            sort=None,
            folder="INBOX",
            client=None,
            host=None,
            username=None,
            password=None,
            push_all=False,
            push_type="message",
            limit=None,
            **kwargs):
        """Extract data from an email inbox and push the data forward.

        Note
        ----
        Instances of IMAPClient are NOT thread safe. They should not be shared
        and accessed concurrently from multiple threads.

        Parameters
        ----------
        criteria : str or list
            Criteria argument passed to IMAPClient.search. See
            https://tools.ietf.org/html/rfc3501.html#section-6.4.4.
        sort : str or list, optional
            Sort criteria passed to IMAPClient.sort. Note that SORT is an
            extension to the IMAP4 standard so it may not be supported by all
            IMAP servers. See https://tools.ietf.org/html/rfc5256.
        folder : str, optional
            Folder to read emails from
        client : optional
            An established IMAPClient connection. If not present, the
            host/login information is required.
        host : str, optional
            The IMAP host to connect to
        username : str, optional
            The IMAP username for login
        password : str, optional
            The IMAP password for login
        push_all : bool, optional
            When true, push all retrieved data/emails at once
        push_type : str, optional
            What type of data to extract and push from the emails. Options include:

                * **message**: push email.message.EmailMessage objects
                * **message_id**: push a list of message IDs that can be fetched
                * **all**: push a list of dict(message=<email.message.EmailMessages>, payload=<extracted payload>)
                * **body**: push a list of email bodies
                * **attachment**: push a list of attachments (an email with multiple attachments will be grouped in a sublist)
        limit : int, optional
            Limit to N rows
        **kwargs
            Keyword arguments to pass to IMAPClient if no client is passed

        """
        data = []
        logout = False
        push_types = ["message_id", "message", "all", "body", "attachment"]

        if not client:
            raiseifnot(
                host and username and password,
                "Host/Username/Password required to create IMAPClient",
            )
            dbg("Logging into IMAPClient %s/%s" % (host, username))
            logout = True
            client = IMAPClient(host, **kwargs)
            client.login(username, password)

        try:
            client.select_folder(folder)
            if sort:
                messages = client.sort(sort, criteria=criteria)
            else:
                messages = client.search(criteria)
            dbg("Found %d email messages" % len(messages))

            if push_type == "message_id":
                if limit:
                    data = messages[:limit]
                else:
                    data = messages
            else:
                raiseifnot(
                    push_type in push_types,
                    "Unrecognized push_type: %s, options: %s" %
                    (push_type, push_types),
                )
                count = 0
                for msg_id, msg_data in client.fetch(messages,
                                                     ["RFC822"]).items():
                    raw = msg_data[b"RFC822"].decode("utf8")
                    msg = parser.Parser(policy=policy.default).parsestr(raw)

                    if push_type == "message":
                        data.append(msg)
                    else:
                        payload = extract_email_payload(msg)
                        if push_type == "body":
                            data.append(payload[0])
                        elif push_type == "attachment":
                            data.append(payload[1:])
                        elif push_type == "all":
                            data.append(dict(message=msg, payload=payload))

                    count += 1
                    if limit and count >= limit:
                        break

        finally:
            if logout:
                client.logout()

        if push_all:
            self.push(data)
        else:
            for row in data:
                self.push(row)
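The core IMAPClient calls can be sketched on their own; the host, credentials, and search criteria below are placeholders.

from email import parser, policy
from imapclient import IMAPClient

with IMAPClient("imap.example.com") as client:
    client.login("username", "password")
    client.select_folder("INBOX")

    # Criteria follow RFC 3501 SEARCH syntax, e.g. ["UNSEEN"] or ["SINCE", some_date]
    message_ids = client.search(["UNSEEN"])

    for msg_id, msg_data in client.fetch(message_ids, ["RFC822"]).items():
        raw = msg_data[b"RFC822"].decode("utf8")
        msg = parser.Parser(policy=policy.default).parsestr(raw)
        print(msg_id, msg["Subject"])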
Example #20
    def run(
        self,
        rows,
        conn,
        table,
        cursor=None,
        commit=True,
        rollback=False,
        stmt_type="REPLACE",
        odku=False,
        swap=False,
        keep_old=False,
        push_data=False,
        dry_run=False,
    ):
        """Form SQL statement and use bulk execute to write rows to table

        Parameters
        ----------
        rows
            Iterable of rows to load to the table
        conn
            Database connection
        table : str
            Name of a table to write the data to
        cursor : optional
            Database connection cursor
        commit : bool, optional
            If true try to commit the transaction. If your connection
            autocommits this will have no effect. If this is a SQLAlchemy
            connection and you are in a transaction, it will try to get a
            reference to the current transaction and call commit on that.
        rollback : bool, optional
            If true try to rollback the transaction on exceptions. Behavior
            may vary by backend DB library if you are not currently in a
            transaction.
        stmt_type : str, optional
            Type of SQL statement to use (REPLACE, INSERT, etc.). **Note:** Backend
            support for this varies.
        odku : bool or list, optional
            If true, add ON DUPLICATE KEY UPDATE clause for all columns. If a
            list then only add it for the specified columns. **Note:** Backend
            support for this varies.
        swap : bool, optional
            If true, load a table and then swap it into the target table via rename.
            Not supported with all database back ends.
        keep_old : bool, optional
            If true and swapping tables, keep the original table with a __old
            suffix added to the name
        push_data : bool, optional
            If true, push the data forward instead of the table name
        dry_run : bool, optional
            If true, skip actually loading the data

        """
        load_table = table
        if swap:
            load_table = add_table_suffix(table, "__swap")

        sql = self.get_bulk_statement(conn, stmt_type, load_table, rows, odku=odku)
        dbg("Loading %d rows\n%s" % (size(rows, "n/a"), sqlformat(sql)), indent="label")

        if dry_run:
            warn("dry_run=True, skipping load in %s.run" % self.__class__.__name__)
        else:
            if not cursor:
                cursor = self.get_sql_executor(conn)

            try:
                if swap:
                    self.create_like(conn, cursor, load_table, table, drop=True)

                self.executemany(conn, cursor, sql, rows)

                if swap:
                    old_table = add_table_suffix(table, "__old")
                    self.rename_tables(
                        conn, cursor, [(table, old_table), (load_table, table)]
                    )
                    if not keep_old:
                        self.drop_table(conn, cursor, old_table)

                if commit:
                    self.commit(conn)
            except:
                if rollback:
                    self.rollback(conn)
                raise

        if push_data:
            self.push(rows)
        else:
            self.push(table)
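The swap=True path boils down to load-into-a-side-table-then-rename. A sqlite3 sketch of that ordering (CREATE TABLE ... LIKE is MySQL syntax, so the schema is simply repeated here):

import sqlite3

rows = [(1, "a"), (2, "b")]

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute("create table target (id integer, name text)")
cur.execute("insert into target values (0, 'old')")

# Load into a __swap table, then rename it into place, as in the swap=True branch
cur.execute("drop table if exists target__swap")
cur.execute("create table target__swap (id integer, name text)")
cur.executemany("insert into target__swap values (?, ?)", rows)

cur.execute("alter table target rename to target__old")
cur.execute("alter table target__swap rename to target")
cur.execute("drop table target__old")  # skipped when keep_old is true
conn.commit()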