Example #1
def run_to_pandas(connection, commands, date_col=None, index_col=None):
    result = None

    # Specify the MediaWiki date format for each of the date_cols, if any
    if date_col:
        date_col = ensure_list(date_col)
        date_format = "%Y%m%d%H%M%S"
        date_col = {col: date_format for col in date_col}

    # To-do: SQL syntax errors cause a chain of multiple Python errors
    # The simplest way to fix this is probably to get the raw results and
    # then turn them into a data frame; this would let us avoid using
    # Pandas's complex SQL machinery.
    for command in commands:
        try:
            result = pd.read_sql_query(command,
                                       connection,
                                       index_col=index_col,
                                       parse_dates=date_col)
        # pandas will encounter a TypeError with DDL (e.g. CREATE TABLE) or
        # DML (e.g. INSERT) statements
        except TypeError:
            pass

    return result
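For orientation, here is a minimal usage sketch. It assumes a DB-API connection from a companion `connect` helper (not shown above), and the wiki, table, and column names are placeholders:

# Hypothetical usage sketch; `connect` and the column names are assumptions.
conn = connect("enwiki")
revisions = run_to_pandas(
    conn,
    ["SELECT rev_id, rev_timestamp FROM revision LIMIT 10"],
    date_col="rev_timestamp",
    index_col="rev_id",
)
conn.close()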
Example #2
def run(commands, format="pandas", session_type="regular", extra_settings={}):
    """
    Runs SQL commands against the Hive tables in the Data Lake using the
    PySpark SQL interface.

    Arguments:
    * `commands`: the SQL to run. A string for a single command or a list of
      strings for multiple commands within the same session (useful for things
      like setting session variables). Passing more than one query is *not*
      supported; only results from the last one will be returned.
    * `format`: the format in which to return data
        * "pandas": a Pandas data frame
        * "raw": a list of tuples, as returned by the Spark SQL interface.
    * `session_type`: the type of Spark session to create.
        * "regular": the default; able to use up to 15% of Hadoop cluster
          resources
        * "large": for queries which require more processing (e.g. joins) or
          which access more data; able to use up to 30% of Hadoop cluster
          resources.
    * `extra_settings`: A dict of additional settings to use when creating
      the Spark session. These will override the defaults specified
      by `session_type`.
    """

    if format not in ["pandas", "raw"]:
        raise ValueError("The `format` should be either `pandas` or `raw`.")
    if session_type not in ["regular", "large"]:
        raise ValueError(
            "'{}' is not a valid Spark session type.".format(session_type))
    commands = ensure_list(commands)

    result = None
    uncollected_result = None
    # TODO: Switching the Spark session type has no effect if the previous
    # session is still running.
    spark_session = get_session(type=session_type,
                                extra_settings=extra_settings)
    for cmd in commands:
        cmd_result = spark_session.sql(cmd)
        # If the result has columns, the command was a query and therefore
        # results-producing. If not, it was a DDL or DML command and not
        # results-producing.
        if len(cmd_result.columns) > 0:
            uncollected_result = cmd_result
    if uncollected_result is not None:
        if format == "pandas":
            result = uncollected_result.toPandas()
        elif format == "raw":
            result = uncollected_result.collect()

    start_session_timeout(spark_session)

    return result
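A hedged usage sketch for this version of run; the session-variable command and the table name are illustrative placeholders:

# Hypothetical usage: only the last results-producing command's output is returned.
df = run(
    ["SET hive.exec.dynamic.partition = true",
     "SELECT page_id, page_title FROM some_database.page LIMIT 10"],
    format="pandas",
    session_type="regular",
)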
Example #3
def run(commands, format="pandas", engine="cli"):
    """
    Runs SQL commands against the Hive tables in the Data Lake. Currently,
    this simply passes the commands to the `run_cli` function.
    """

    if format not in ["pandas", "raw"]:
        raise ValueError("The `format` should be either `pandas` or `raw`.")
    if engine not in ["cli"]:
        raise ValueError("'{}' is not a valid engine.".format(engine))
    commands = ensure_list(commands)

    if engine == "cli":
        return run_cli(commands, format)
Example #4
def run(commands,
        format="pandas",
        heap_size="deprecated",
        engine="deprecated"):
    """
    Runs SQL commands against the Hive tables in the Data Lake.

    Arguments:
    * `commands`: the SQL to run. A string for a single command or a list of
      strings for multiple commands within the same session (useful for things
      like setting session variables). Passing more than one query is *not*
      supported, and will usually result in an error.
    * `format`: what format to return the results in
        * "pandas": a Pandas data frame
        * "raw": a TSV string, as returned by the command line interface. [Deprecated]
    * `heap_size`: [Deprecated]
    * `engine`: [Deprecated]
    """

    if format not in ["pandas", "raw"]:
        raise ValueError("The `format` should be either `pandas` or `raw`.")

    if format == "raw":
        warnings.warn(
            "The 'raw' format is deprecated. It will be removed in the next major release.",
            category=FutureWarning)

    if heap_size != "deprecated":
        warnings.warn(
            "'heap_size' is deprecated. It will be removed in the next major release",
            category=FutureWarning)

    if engine != "deprecated":
        warnings.warn(
            "'engine' is deprecated. It will be removed in the next major release.",
            category=FutureWarning)

    check_kerberos_auth()

    connect_kwargs = {
        "host": HIVE_URL,
        "auth": "KERBEROS",
        "username": pwd.getpwuid(os.getuid()).pw_name,
        "kerberos_service_name": KERBEROS_SERVICE_NAME,
    }

    commands = ensure_list(commands)
    response = None

    with hive.connect(**connect_kwargs) as conn:
        for command in commands:
            if format == "pandas":
                try:
                    # This works when the command is a SQL query, so the last
                    # query in `commands` is the one whose results are returned
                    response = pd.read_sql(command, conn)
                except TypeError:
                    # The weird thing here is that the command actually runs;
                    # Pandas just has trouble reading the result. So when we
                    # pass here, we don't need to re-run the command.
                    pass

            elif format == "raw":
                cursor = conn.cursor()
                cursor.execute(command)
                response = cursor.fetchall()

    return response
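As a usage sketch (the database and table names are placeholders, not guaranteed to exist):

# Hypothetical usage: a single query returned as a Pandas data frame.
df = run("SELECT dt, uri_path FROM some_database.webrequest LIMIT 10")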
Example #5
def run_cli(commands,
            format="pandas",
            heap_size=1024,
            use_nice=True,
            use_ionice=True):
    """
    Runs SQL commands against the Hive tables in the Data Lake using Hive's
    command line interface.

    Arguments:
    * `commands`: the SQL to run. A string for a single command or a list of
      strings for multiple commands within the same session (useful for things
      like setting session variables). Passing more than one query is *not*
      supported, and will usually result in an error.
    * `format`: what format to return the results in
        * "pandas": a Pandas data frame
        * "raw": a TSV string, as returned by the command line interface.
    * `heap_size`: the amount of memory available to the Hive client. Increase
      this if a command experiences an out of memory error.
    * `use_nice`: Run with a lower priority for processor usage.
    * `use_ionice`: Run with a lower priority for disk access.
    """

    commands = ensure_list(commands)
    if format not in ["pandas", "raw"]:
        raise ValueError("'{}' is not a valid format.".format(format))
    check_kerberos_auth()

    shell_command = "export HADOOP_HEAPSIZE={0} && "
    if use_nice:
        shell_command += "/usr/bin/nice "
    if use_ionice:
        shell_command += "/usr/bin/ionice "
    shell_command += "/usr/bin/hive -S -f {1}"

    result = None

    # Support multiple commands by concatenating them in one file. If the user
    # has passed more than one query, this will result in an error when Pandas
    # tries to read the resulting concatenated output (unless the queries
    # happen to produce the same number of columns).
    #
    # Ideally, we would return only the last query's results or throw a clearer
    # error ourselves. However, there's no simple way to determine if multiple
    # queries have been passed or separate their output, so it's not worth
    # the effort.
    merged_commands = ";\n".join(commands)

    try:
        # Create temporary files in current working directory to write to:
        cwd = os.getcwd()
        query_fd, query_path = tempfile.mkstemp(suffix=".hql", dir=cwd)
        results_fd, results_path = tempfile.mkstemp(suffix=".tsv", dir=cwd)

        # Write the Hive query:
        with os.fdopen(query_fd, 'w') as fp:
            fp.write(merged_commands)

        # Execute the Hive query:
        shell_command = shell_command.format(heap_size, query_path)
        hive_call = subprocess.run(shell_command,
                                   shell=True,
                                   stdout=results_fd,
                                   stderr=subprocess.PIPE)
        if hive_call.returncode == 0:
            # Read the results upon successful execution of cmd:
            if format == "pandas":
                try:
                    result = pd.read_csv(results_path, sep='\t')
                except pd.errors.EmptyDataError:
                    # The command had no output
                    pass
            else:
                # If user requested "raw" results, read the text file as-is:
                with open(results_path, 'r') as file:
                    content = file.read()
                    # If the statement had output:
                    if content:
                        result = content
        # If the hive call has not completed successfully
        else:
            # Remove logspam from the standard error so it's easier to see
            # the actual error
            stderr = iter(hive_call.stderr.decode().splitlines())
            cleaned_stderr = ""
            for line in stderr:
                filter = r"JAVA_TOOL_OPTIONS|parquet\.hadoop|WARN:|:WARN|SLF4J"
                if re.search(filter, line) is None:
                    cleaned_stderr += line + "\n"

            raise ChildProcessError(
                "The Hive command line client encountered the following "
                "error:\n{}".format(cleaned_stderr))
    finally:
        # Remove temporary files:
        os.unlink(query_path)
        os.unlink(results_path)

    return result
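A usage sketch for the CLI path, with a larger client heap for a heavier query; the table and column names are placeholders:

# Hypothetical usage: "raw" returns the TSV output of the Hive CLI as a string.
tsv = run_cli(
    "SELECT year, COUNT(*) AS row_count FROM some_database.pageview_hourly GROUP BY year",
    format="raw",
    heap_size=2048,
)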
Example #6
def run(commands,
        format="pandas",
        session_type="yarn-regular",
        extra_settings={}):
    """
    Runs SQL commands against the Hive tables in the Data Lake using the
    PySpark SQL interface.

    Note: The session_type and extra_settings will only be applied
    the first time this function is called.  The SparkSession is only instantiated
    once per process, and each subsequent call will re-use the previously
    created SparkSession.

    Arguments:
    * `commands`: the SQL to run. A string for a single command or a list of
      strings for multiple commands within the same session (useful for things
      like setting session variables). Passing more than one query is *not*
      supported; only results from the last one will be returned.
    * `format`: the format in which to return data
        * "pandas": a Pandas data frame
        * "raw": a list of tuples, as returned by the Spark SQL interface.
    * `session_type`: the type of Spark session to create.
        * "local": Run the command in a local Spark process. Use this for
          prototyping or querying small-ish data (less than a couple of GB).
        * "yarn-regular": the default; able to use up to 15% of Hadoop cluster
          resources
        * "yarn-large": for queries which require more processing (e.g. joins) or
          which access more data; able to use up to 30% of Hadoop cluster
          resources.
    * `extra_settings`: A dict of additional settings to use when creating
      the Spark session. These will override the defaults specified
      by `session_type`.
    """

    if format not in ["pandas", "raw"]:
        raise ValueError("The `format` should be either `pandas` or `raw`.")
    if format == "raw":
        warnings.warn(
            "The 'raw' format is deprecated. It will be removed in the next major release.",
            category=FutureWarning)

    commands = ensure_list(commands)

    # TODO: Switching the Spark session type has no effect if the previous
    # session is still running.
    spark_session = get_session(type=session_type,
                                extra_settings=extra_settings)

    overall_result = None

    for cmd in commands:
        cmd_result = spark_session.sql(cmd)
        # If the result has columns, the command was a query and therefore
        # results-producing. If not, it was a DDL or DML command and not
        # results-producing.
        if len(cmd_result.columns) > 0:
            overall_result = cmd_result

    if overall_result:
        if format == "pandas":
            overall_result = overall_result.toPandas()
        elif format == "raw":
            overall_result = overall_result.collect()

    # (Re)start a timeout on SparkSessions in Yarn after the result is collected.
    # A SparkSession used by this run function will time out after 3600 seconds
    # (one hour), unless it is used again.
    if PREDEFINED_SPARK_SESSIONS[session_type]["master"] == "yarn":
        start_session_timeout(spark_session, 3600)

    return overall_result
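A usage sketch for the Yarn-based session types; the table name and the Spark setting value are illustrative assumptions:

# Hypothetical usage: a heavier aggregation run on the cluster, overriding one
# Spark setting for this session.
df = run(
    "SELECT page_id, COUNT(*) AS edit_count FROM some_database.revision GROUP BY page_id",
    session_type="yarn-large",
    extra_settings={"spark.sql.shuffle.partitions": "512"},
)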
Example #7
def run(commands,
        dbs,
        use_x1=False,
        format="pandas",
        date_col=None,
        index_col=None):
    """
    Run SQL queries or commands on the Analytics MediaWiki replicas.

    Arguments:
    * `commands`: the SQL to run. A string for a single command or a list of
      strings for multiple commands within the same session (useful for things
      like setting session variables).
    * `dbs`: a string for one database or a list to run the commands on
      multiple databases and concatenate the results.  Possible values:
        * a wiki's database code (e.g. "enwiki", "arwiktionary", "wikidatawiki")
          for its MediaWiki database (or its ExtensionStorage database if
          `use_x1` is passed)
        * "logs" for the EventLogging
        * "centralauth" for global accounts
        * "wikishared" for cross-wiki ExtensionStorage
        * "staging" for user-writable ad-hoc tests and analysis
    * `use_x1`: whether to connect to the given database on the
      ExtensionStorage replica (only works for wiki databases or "wikishared").
      Default false.
    * `format`: which format to return the data in. "pandas" (the default) means
      a Pandas DataFrame, while "raw" means a named tuple consisting of (1) the
      column names and (2) the records as a list of tuples, the raw format
      specified by Python's database API specification v2.0.
    * `date_col`: if using Pandas format, this parses the specified column or
      columns from MediaWiki datetimes into Pandas datetimes. If using raw
      format, has no effect.
    * `index_col`: if using Pandas format, passed to pandas.read_sql_query to
      set a column or columns as the index. If using raw format, has no
      effect.
    """

    if format == "raw":
        warnings.warn(
            "The 'raw' format is deprecated. It will be removed in the next major release.",
            category=FutureWarning)

    # Make single command and database parameters lists
    commands = ensure_list(commands)
    dbs = ensure_list(dbs)

    results = []

    if format == "pandas":
        for db in dbs:
            connection = connect(db, use_x1)
            result = run_to_pandas(connection, commands, date_col, index_col)
            connection.close()
            results.append(result)

        if len(dbs) > 1:
            # Ignore the indexes on the partial results unless a custom index
            # column was designated
            if not index_col:
                ignore_index = True
            else:
                ignore_index = False

            return pd.concat(results, ignore_index=ignore_index)
        else:
            return results[0]

    elif format == "raw":
        for db in dbs:
            connection = connect(db, use_x1)
            result = run_to_tuples(connection, commands)
            connection.close()
            results.append(result)

        if len(dbs) > 1:
            # Take the first set of column names since they'll all be the same
            column_names = results[0].column_names

            record_sets = [result.records for result in results]
            # Flatten the per-database record lists into a single list
            records = list(chain.from_iterable(record_sets))

            return ResultSet(column_names, records)
        else:
            return results[0]

    else:
        raise ValueError("The format you specified is not supported.")
Example #8
def run(commands, catalog="analytics_hive"):
    """
    Runs one or more SQL commands using the Presto SQL engine and returns the last result
    in a Pandas DataFrame.
    
    Presto can be connected to many different backend data stores, or catalogs.
    Currently, it is only connected to the Data Lake, which has the catalog
    name "analytics_hive".

    """
    commands = ensure_list(commands)
    check_kerberos_auth()

    USER_NAME = os.getenv("USER")
    PRESTO_AUTH = prestodb.auth.KerberosAuthentication(
        config="/etc/krb5.conf",
        service_name="presto",
        principal=f"{USER_NAME}@WIKIMEDIA",
        ca_bundle="/etc/ssl/certs/Puppet_Internal_CA.pem")

    connection = prestodb.dbapi.connect(catalog=catalog,
                                        host="an-coord1001.eqiad.wmnet",
                                        port=8281,
                                        http_scheme="https",
                                        user=USER_NAME,
                                        auth=PRESTO_AUTH,
                                        source=f"{USER_NAME}, wmfdata-python")

    cursor = connection.cursor()
    final_result = None
    for command in commands:
        cursor.execute(command)
        result = cursor.fetchall()
        description = cursor.description

        # Weirdly, commands that don't produce results (like CREATE TABLE or
        # INSERT) return [[True]] with a single column named "results". Most
        # users can't run those commands, though.
        # TO-DO: report this as a bug upstream
        if result == [[True]] and description[0][0] == "results":
            pass
        else:
            # Based on
            # https://github.com/prestodb/presto-python-client/issues/56#issuecomment-367432438
            colnames = [col[0] for col in description]
            dtypes = [col[1] for col in description]

            def setup_transform(col, desired_dtype):
                # Only Hive dates/times need special handling
                if desired_dtype in ("timestamp", "date"):
                    return lambda df: pd.to_datetime(df[col])
                else:
                    return lambda df: df[col]

            transformations = {
                col: setup_transform(col, dtype)
                for col, dtype in zip(colnames, dtypes)
            }

            final_result = (pd.DataFrame(
                result, columns=colnames).assign(**transformations))

    cursor.cancel()
    connection.close()

    return final_result
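A usage sketch for the Presto path; the default "analytics_hive" catalog is used, and the table name is a placeholder:

# Hypothetical usage: timestamp and date columns are converted to Pandas datetimes.
df = run("SELECT dt, uri_host FROM some_database.webrequest LIMIT 10")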