Example #1
    def _execute(self, statement, cursor, wait, session_properties):
        """
        If something goes wrong, `PrestoClient` will attempt to parse the error
        log and present the user with useful debugging information. If that fails,
        the full traceback will be raised instead.
        """
        from pyhive import presto  # Imported here due to slow import performance in Python 3
        from pyhive.exc import DatabaseError  # Imported here due to slow import performance in Python 3
        try:
            cursor = cursor or presto.Cursor(
                host=self.host, port=self.port, username=self.username, password=self.password,
                catalog=self.catalog, schema=self.schema, session_props=session_properties,
                poll_interval=1, source=self.source, protocol=self.server_protocol
            )
            cursor.execute(statement)
            status = cursor.poll()
            if wait:
                logger.progress(0)
                # status None means command executed successfully
                # See https://github.com/dropbox/PyHive/blob/master/pyhive/presto.py#L234
                while status is not None and status['stats']['state'] != "FINISHED":
                    if status['stats'].get('totalSplits', 0) > 0:
                        pct_complete = round(status['stats']['completedSplits'] / float(status['stats']['totalSplits']), 4)
                        logger.progress(pct_complete * 100)
                    status = cursor.poll()
                logger.progress(100, complete=True)
            return cursor
        except (DatabaseError, pandas.io.sql.DatabaseError) as e:
            # Attempt to parse database error, before ultimately reraising the same
            # exception, maintaining the full stacktrace.
            exception, exception_args, traceback = sys.exc_info()

            try:
                message = e.args[0]
                if isinstance(message, six.string_types):
                    message = ast.literal_eval(re.match("[^{]*({.*})[^}]*$", message).group(1))

                linenumber = message['errorLocation']['lineNumber'] - 1
                splt = statement.splitlines()
                splt[linenumber] += '   <--  {errorType} ({errorName}) occurred. {message} '.format(**message)
                context = '\n\n[Error Context]\n{}\n'.format('\n'.join([splt[l] for l in range(max(linenumber - 1, 0),
                                                                                               min(linenumber + 2, len(splt)))]))

                class ErrContext(object):

                    def __repr__(self):
                        return context

                # logged twice so that both notebook and console users see the error context
                exception_args.args = [exception_args, ErrContext()]
                logger.error(context)
            except:
                logger.warn(("Omniduct was unable to parse the database error messages. Refer to the "
                             "traceback below for full error details."))

            if isinstance(exception, type):
                exception = exception(exception_args)

            raise_with_traceback(exception, traceback)
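A minimal sketch of the error-parsing step used above, applied to a hypothetical Presto error payload. The regular expression and `ast.literal_eval` call mirror the handler; the sample message and statement are invented for illustration.

    import ast
    import re

    # Hypothetical error text, shaped like the payload the handler above
    # expects to find in DatabaseError.args[0].
    raw = ("Query failed: {'message': 'Column foo cannot be resolved', "
           "'errorName': 'COLUMN_NOT_FOUND', 'errorType': 'USER_ERROR', "
           "'errorLocation': {'lineNumber': 2, 'columnNumber': 8}}")

    # Extract the dict-like portion of the message and parse it safely.
    payload = ast.literal_eval(re.match("[^{]*({.*})[^}]*$", raw).group(1))

    # Annotate the offending line of the statement, as the handler does.
    statement = "SELECT *\nFROM foo"
    lines = statement.splitlines()
    linenumber = payload['errorLocation']['lineNumber'] - 1
    lines[linenumber] += '   <--  {errorType} ({errorName}) occurred. {message} '.format(**payload)
    print('\n'.join(lines))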
Example #2
    def register_from_config(self, config, override=False):
        """
        Register a collection of Duct service configurations.

        The configuration format must be one of the following:
        - An iterable sequence of dictionaries, each containing the keyword
          arguments required to instantiate a `Duct` subclass.
        - A dictionary mapping names of `Duct` instances to dictionaries of
          keyword arguments.
        - A dictionary mapping Duct types ('databases', 'filesystems', etc) to
          mappings like those immediately above.
        - A string YAML representation of one of the above (with at least one
          newline character).
        - A string filename containing such a YAML representation.

        There are three special keyword arguments that are required by the
        `DuctRegistry` instance:
        - name: Should be present only in the configuration dictionary when
          config is provided as an iterable sequence of dictionaries.
        - protocol: Specifies which `Duct` subclass to use. Failure to set this
          correctly will result in a warning and the configuration being
          ignored.
        - register_magics (optional): A boolean flag indicating whether to
          register any magics defined by this Duct class (default: True).

        Args:
            config (iterable, dict, str, None): A configuration specified in one
                of the above described formats.
            override (bool): Whether to override any existing `Duct` instance
                of the same name(s). If `False`, any overrides will result in an
                exception.
        """
        # Extract configuration from a file if necessary, and then process it.
        if isinstance(config, six.string_types):
            if '\n' in config:
                config = yaml.safe_load(config)
            else:
                with open(config) as f:
                    config = yaml.safe_load(f.read())
        config = self._process_config(config)

        for duct_config in config:
            names = duct_config.pop('name')
            protocol = duct_config.pop('protocol')
            register_magics = duct_config.pop('register_magics', True)
            try:
                self.new(names,
                         protocol,
                         register_magics=register_magics,
                         override=override,
                         **duct_config)
            except DuctProtocolUnknown as e:
                logger.error(
                    "Failed to configure `Duct` instance(s) '{}'. {}".format(
                        "', '".join(names.split(',')), str(e)))

        return self
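For illustration, here is how a registry might be configured from a YAML string in the type-keyed format described above. This is a sketch only: `registry` is assumed to be an existing `DuctRegistry` instance, and the protocol names, hosts and ports are placeholder values rather than a recommended configuration.

    # Placeholder configuration: the protocols, hosts and ports are assumptions
    # for illustration, not defaults of any real deployment.
    config = """
    databases:
        presto_db:
            protocol: presto
            host: localhost
            port: 8080
    filesystems:
        local_fs:
            protocol: localfs
    """

    # Because the string contains newline characters it is parsed as YAML
    # directly, rather than being interpreted as a filename.
    registry.register_from_config(config)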
Example #3
    def _execute(self, statement, query=True, cursor=None, wait=False):
        from pyhive.exc import DatabaseError  # Imported here due to slow import performance in Python 3
        try:
            cursor = cursor or self.__presto.cursor()
            cursor.execute(statement)
            status = cursor.poll()
            if wait or query:
                logger.progress(0)
                # status None means the command has already finished
                while status is not None and status['stats']['state'] != "FINISHED":
                    if status['stats'].get('totalSplits', 0) > 0:
                        pct_complete = round(
                            status['stats']['completedSplits'] /
                            float(status['stats']['totalSplits']), 4)
                        logger.progress(pct_complete * 100)
                    status = cursor.poll()
                logger.progress(100, complete=True)
            return cursor
        except (DatabaseError, pandas.io.sql.DatabaseError) as e:
            # Attempt to parse database error, before ultimately reraising the same
            # exception, maintaining the full stacktrace.
            exception, exception_args, traceback = sys.exc_info()

            try:
                message = e.args[0]
                if isinstance(message, str):
                    message = ast.literal_eval(
                        re.match("[^{]*({.*})[^}]*$", message).group(1))

                linenumber = message['errorLocation']['lineNumber'] - 1
                splt = statement.splitlines()
                splt[linenumber] += '   <--  {errorType} ({errorName}) occurred. {message} '.format(**message)
                context = '\n\n[Error Context]\n{}\n'.format('\n'.join(
                    [splt[l] for l in range(max(linenumber - 1, 0), min(linenumber + 2, len(splt)))]
                ))

                class ErrContext(object):
                    def __repr__(self):
                        return context

                # logged twice so that both notebook and console users see the error context
                exception_args.args = [exception_args, ErrContext()]
                logger.error(context)
            except:
                logger.warn((
                    "Omniduct was unable to parse the database error messages. Refer to the "
                    "traceback below for full error details."))

            if isinstance(exception, type):
                exception = exception(exception_args)

            raise_with_traceback(exception, traceback)
Example #4
    def import_from_config(self, config):
        config = self._process_config(config)

        for t in [t.value for t in Duct.Type]:
            for names, options in config.get(t, {}).items():
                protocol = options.pop('protocol')
                register_magics = options.pop('register_magics', True)
                try:
                    self.new(names, protocol, register_magics=register_magics, **options)
                except DuctProtocolUnknown as e:
                    logger.error("Failed to configure `Duct` instance(s) '{}'. {}".format("', '".join(names.split(',')), str(e)))

        return self
Example #5
    def _execute(self, statement, cursor, wait, session_properties):
        """
        If something goes wrong, `PrestoClient` will attempt to parse the error
        log and present the user with useful debugging information. If that fails,
        the full traceback will be raised instead.
        """
        from pyhive import presto  # Imported here due to slow import performance in Python 3
        from pyhive.exc import DatabaseError  # Imported here due to slow import performance in Python 3
        try:
            cursor = cursor or presto.Cursor(host=self.host,
                                             port=self.port,
                                             username=self.username,
                                             password=self.password,
                                             catalog=self.catalog,
                                             schema=self.schema,
                                             session_props=session_properties,
                                             poll_interval=1,
                                             source=self.source,
                                             protocol=self.server_protocol)
            cursor.execute(statement)
            status = cursor.poll()
            if wait:
                logger.progress(0)
                # status None means command executed successfully
                # See https://github.com/dropbox/PyHive/blob/master/pyhive/presto.py#L234
                while status is not None and status['stats']['state'] != "FINISHED":
                    if status['stats'].get('totalSplits', 0) > 0:
                        pct_complete = round(
                            status['stats']['completedSplits'] /
                            float(status['stats']['totalSplits']), 4)
                        logger.progress(pct_complete * 100)
                    status = cursor.poll()
                logger.progress(100, complete=True)
            return cursor
        except (DatabaseError, pandas.io.sql.DatabaseError) as e:
            # Attempt to parse database error, before ultimately reraising the same
            # exception, maintaining the full stacktrace.
            exception, exception_args, traceback = sys.exc_info()

            try:
                message = e.args[0]
                if isinstance(message, six.string_types):
                    message = ast.literal_eval(
                        re.match("[^{]*({.*})[^}]*$", message).group(1))

                linenumber = message['errorLocation']['lineNumber'] - 1
                splt = statement.splitlines()
                splt[linenumber] += '   <--  {errorType} ({errorName}) occurred. {message} '.format(**message)
                context = '\n\n[Error Context]\n{}\n'.format('\n'.join(
                    [splt[l] for l in range(max(linenumber - 1, 0), min(linenumber + 2, len(splt)))]
                ))

                class ErrContext(object):
                    def __repr__(self):
                        return context

                # logged twice so that both notebook and console users see the error context
                exception_args.args = [exception_args, ErrContext()]
                logger.error(context)
            except:
                logger.warn((
                    "Omniduct was unable to parse the database error messages. Refer to the "
                    "traceback below for full error details."))

            if isinstance(exception, type):
                exception = exception(exception_args)

            raise_with_traceback(exception, traceback)
Example #6
    def execute(self,
                statement,
                query=False,
                parse=True,
                index_field=None,
                date_fields=None,
                cleanup_statement=True,
                render_only=False,
                **kwargs):
        '''
        Execute a statement against the data source.

        Parameters
        ----------
        statement : The statement to be executed by the query client.
        query : Whether this statement returns data (`True`) or not (`False`).
        parse : Whether the results of this query should be converted to a pandas DataFrame.
        index_field : The field to use as the index of the dataframe, or None.
        date_fields : List of fields to be converted to datetime objects, or None.
        cleanup_statement : Whether to clean up the statement before execution.
        render_only : Whether to return the rendered statement(s) instead of executing them.
        kwargs : Extra keyword arguments to be passed on to `_execute`, as implemented by subclasses.

        Returns
        -------
        A pandas.DataFrame object if `query` and `parse` are both `True`.
        A DBAPI2 cursor object if `query` is `True`, and `parse` is `False`.
        `None` otherwise.
        '''
        self.connect()
        statements = self.statements_split(statement)
        statements = [
            self.statement_cleanup(stmt) if cleanup_statement else stmt
            for stmt in statements
        ]
        assert len(statements) > 0, "No non-empty statements were provided."
        if render_only:
            return ';\n'.join(statements)
        cursor = None
        for statement in statements[:-1]:
            cursor = self.connect()._execute(statement,
                                             query=False,
                                             cursor=cursor,
                                             **kwargs)
        cursor = self.connect()._execute(statements[-1],
                                         query,
                                         cursor=cursor,
                                         **kwargs)

        if not query or self._cursor_empty(cursor):
            return None
        if parse:
            df = self._cursor_to_dataframe(cursor)
            cursor.close()

            if date_fields is None:  # if user supplied, use as is
                date_fields = config.date_fields or []
                date_fields = [field for field in date_fields if field in df]

            if date_fields:
                try:
                    df = pandas.io.sql._parse_date_columns(df, date_fields)
                except:
                    logger.error(
                        'Unable to parse date columns. Perhaps your version of pandas is outdated.'
                    )
            if index_field is not None:
                df.set_index(index_field, inplace=True)
            return df
        else:
            return cursor
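A minimal usage sketch for the method above, assuming `client` is a connected query client exposing this interface; the table and column names are placeholders.

    # Hypothetical query: the table and column names are placeholders.
    df = client.execute(
        "SELECT user_id, created_at FROM events LIMIT 10",
        query=True,                  # the statement returns rows
        parse=True,                  # convert the cursor output to a DataFrame
        date_fields=['created_at'],  # parse this column as datetimes
        index_field='user_id',       # use this column as the index
    )
    print(df.head())

    # With parse=False, the raw DBAPI2 cursor is returned instead.
    cursor = client.execute("SELECT 1", query=True, parse=False)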
Example #7
    def _connect(self):
        """
        The workflow to handle passwords and host keys used by this method is
        inspired by the `pxssh` module of `pexpect` (https://github.com/pexpect/pexpect).
        We have adjusted this workflow to our purposes.
        """
        import pexpect

        # Create socket directory if it doesn't exist.
        socket_dir = os.path.dirname(self._socket_path)
        if not os.path.exists(socket_dir):
            os.makedirs(socket_dir)
        # Create persistent master connection and exit.
        cmd = ''.join([
            "ssh {login} -MT ",
            "-S {socket} ",
            "-o ControlPersist=yes ",
            "-o StrictHostKeyChecking=no ",
            "-o UserKnownHostsFile=/dev/null " if not self.check_known_hosts else "",
            "-o NoHostAuthenticationForLocalhost=yes ",
            "-o ServerAliveInterval=60 ",
            "-o ServerAliveCountMax=2 ",
            "'exit'",
        ]).format(login=self._login_info, socket=self._socket_path)

        expected = [
            "WARNING: REMOTE HOST IDENTIFICATION HAS CHANGED!",    # 0
            "(?i)are you sure you want to continue connecting",    # 1
            "(?i)(?:(?:password)|(?:passphrase for key)):",        # 2
            "(?i)permission denied",                               # 3
            "(?i)terminal type",                                   # 4
            pexpect.TIMEOUT,                                       # 5
            "(?i)connection closed by remote host",                # 6
            "(?i)could not resolve hostname",                      # 7
            pexpect.EOF                                            # 8
        ]

        try:
            expect = pexpect.spawn(cmd)
            i = expect.expect(expected, timeout=10)

            # First phase
            if i == 0:  # If host identification changed, arrest any further attempts to connect
                error_message = (
                    'Host identification for {} has changed! This is most likely '
                    'due to the server being redeployed or reconfigured but '
                    'may also be due to a man-in-the-middle attack. If you trust '
                    'your network connection, you should be safe to update the '
                    'host keys for this host. To do this manually, please remove '
                    'the line corresponding to this host in ~/.ssh/known_hosts; '
                    'or call the `update_host_keys` method of this client.'.format(self._host)
                )
                if self.interactive:
                    logger.error(error_message)
                    auto_fix = input('Would you like this client to do this for you? (y/n)')
                    if auto_fix == 'y':
                        self.update_host_keys()
                        return self.connect()
                    else:
                        raise RuntimeError("Host keys not updated. Please update keys manually.")
                else:
                    raise RuntimeError(error_message)
            if i == 1:  # Request to authorize host certificate (i.e. host not in the 'known_hosts' file)
                expect.sendline("yes")
                i = expect.expect(expected)
            if i == 2:  # Request for password/passphrase
                expect.sendline(self.password or getpass.getpass('Password: '))
                i = expect.expect(expected)

            # Second phase
            if i == 1:  # Another request to authorize host certificate (i.e. host not in the 'known_hosts' file)
                raise RuntimeError('Received a second request to authorize host key. This should not have happened!')
            elif i in (2, 3):  # Second request for password/passphrase or rejection of credentials. For now, give up.
                raise DuctAuthenticationError('Invalid username and/or password, or private key is not unlocked.')
            elif i == 4:  # Another request for terminal type.
                raise RuntimeError('Received a second request for terminal type. This should not have happened!')
            elif i == 5:  # Timeout
                # In our instance, this means that we have not handled some or another aspect of the login procedure.
                # Since we are expecting an EOF when we have successfully logged in, hanging means that the SSH login
                # procedure is waiting for more information. Since we have no more to give, this means our login
                # was unsuccessful.
                raise RuntimeError('SSH client seems to be awaiting more information, but we have no more to give. The '
                                   'messages received so far are:\n{}'.format(expect.before))
            elif i == 6:  # Connection closed by remote host
                raise RuntimeError("Remote closed SSH connection")
            elif i == 7:
                raise RuntimeError("Cannot connect to {} on your current network connection".format(self.host))
        finally:
            expect.close()

        # We should be logged in at this point, but let us make doubly sure
        assert self.is_connected(), 'Unexpected failure to establish a connection with the remote host with command: \n ' \
                                    '{}\n\n Please report this!'.format(cmd)
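The point of the persistent master connection created above is that subsequent SSH invocations passing the same control socket reuse it without re-authenticating. A minimal sketch of that reuse, assuming the master connection has already been established; the login string and socket path are placeholder values.

    import subprocess

    # Placeholder values for illustration; real values come from the client's
    # login info and socket path.
    login_info = 'user@gateway.example.com'
    socket_path = '/tmp/omniduct/user_gateway.sock'

    # Run a command over the existing master connection (-S points ssh at the
    # control socket), avoiding a fresh authentication handshake.
    proc = subprocess.run(
        ['ssh', '-S', socket_path, login_info, 'uptime'],
        capture_output=True, text=True,
    )
    print(proc.stdout)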
Example #8
    def _push(self,
              df,
              table,
              partition_clause='',
              overwrite=False,
              schema='omniduct',
              sep='\t'):
        """
        Create a new table in hive from a pandas DataFrame.

        Parameters
        ----------
        df : pandas.DataFrame or Series
            Data to be pushed into a hive table.
        table : str
            Table name for new hive table.
        schema : str
            Schema (or database) for new hive table.
        partition_clause : str
            The hive partition clause specifying which partitions to load data into.
        overwrite : bool, optional
            Whether to overwrite the table data if it exists. Default: False.
        sep : str
            Field delimiter for data.

        See Also
        --------
        https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DML
        """
        # Save dataframe to file.
        _, tmp_path = tempfile.mkstemp(dir='.')
        tmp_fname = os.path.basename(tmp_path)

        logger.info('Saving dataframe to file... {}'.format(tmp_fname))
        df.to_csv(tmp_fname,
                  index=False,
                  header=False,
                  sep=sep,
                  encoding='utf-8')

        # Create table statement.
        cts = _create_table_statement_from_df(df=df,
                                              table=table,
                                              schema=schema,
                                              drop=overwrite and not partition_clause,
                                              text=True,
                                              sep=sep)
        # Load data statement.
        lds = '\nLOAD DATA LOCAL INPATH "{path}" {overwrite} INTO TABLE {schema}.{table} {partition_clause};'.format(
            path=tmp_fname,
            overwrite="OVERWRITE" if overwrite else "",
            schema=schema,
            table=table,
            partition_clause=partition_clause)

        # SCP data if SSHClient is set.
        if self.remote:
            logger.info('Uploading data to remote host...')
            self.remote.copy_from_local(tmp_fname, tmp_fname)
        # Run create table statement and load data statement.
        logger.info('Creating hive table and loading data...')
        proc = self._run_in_hivecli('\n'.join([cts, lds]))
        if proc.returncode != 0:
            logger.error(proc.stderr)

        # Clean up files.
        logger.info('Cleaning up files...')
        rm_cmd = 'rm -rf {0}'.format(tmp_fname)
        run_in_subprocess(rm_cmd)
        if self.remote:
            self.remote.execute(rm_cmd)
        return proc
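A sketch of pushing a DataFrame through the method above, assuming `hive` is a client instance exposing `_push`; the schema and table names are placeholders.

    import pandas as pd

    # Placeholder data and names for illustration only.
    df = pd.DataFrame({'id': [1, 2, 3], 'value': ['a', 'b', 'c']})

    # Writes the frame to a local temporary file, generates CREATE TABLE and
    # LOAD DATA LOCAL INPATH statements, and runs them via the hive CLI as
    # implemented above. With overwrite=True, any existing table data is replaced.
    hive._push(df, table='my_table', schema='scratch', overwrite=True)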