Example 1
class MySqlOperator(BaseOperator):
    """
    Executes sql code in a specific MySQL database

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:MySqlOperator`

    :param sql: the sql code to be executed. Can receive a str representing a
        sql statement, a list of str (sql statements), or a reference to a template file.
        Template references are recognized by str ending in '.sql'
        (templated)
    :param mysql_conn_id: Reference to :ref:`mysql connection id <howto/connection:mysql>`.
    :param parameters: (optional) the parameters to render the SQL query with.
        Template references are recognized by str ending in '.json'
        (templated)
    :param autocommit: if True, each command is automatically committed.
        (default value: False)
    :param database: name of the database which overrides the one defined in the connection
    """

    template_fields: Sequence[str] = ('sql', 'parameters')
    # TODO: Remove renderer check when the provider has an Airflow 2.3+ requirement.
    template_fields_renderers = {
        'sql': 'mysql' if 'mysql' in wwwutils.get_attr_renderer() else 'sql',
        'parameters': 'json',
    }
    template_ext: Sequence[str] = ('.sql', '.json')
    ui_color = '#ededed'

    def __init__(
        self,
        *,
        sql: Union[str, List[str]],
        mysql_conn_id: str = 'mysql_default',
        parameters: Optional[Union[Mapping, Iterable]] = None,
        autocommit: bool = False,
        database: Optional[str] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.mysql_conn_id = mysql_conn_id
        self.sql = sql
        self.autocommit = autocommit
        self.parameters = parameters
        self.database = database

    def prepare_template(self) -> None:
        """Parse template file for attribute parameters."""
        if isinstance(self.parameters, str):
            self.parameters = ast.literal_eval(self.parameters)

    def execute(self, context: 'Context') -> None:
        self.log.info('Executing: %s', self.sql)
        hook = MySqlHook(mysql_conn_id=self.mysql_conn_id,
                         schema=self.database)
        hook.run(self.sql,
                 autocommit=self.autocommit,
                 parameters=self.parameters)
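
A minimal usage sketch for this operator (the dag_id, task_id, and statement are hypothetical; mysql_conn_id falls back to the 'mysql_default' connection):

from datetime import datetime

from airflow import DAG

with DAG(dag_id='example_mysql', start_date=datetime(2022, 1, 1), schedule_interval=None) as dag:
    create_pet_table = MySqlOperator(
        task_id='create_pet_table',
        # An inline statement; a string ending in '.sql' would instead be
        # rendered as a template file, per template_ext above.
        sql='CREATE TABLE IF NOT EXISTS pet (id INT PRIMARY KEY, name VARCHAR(50))',
    )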
Example 2
class PrestoToMySqlOperator(BaseOperator):
    """
    Moves data from Presto to MySQL. Note that for now the data is loaded
    into memory before being pushed to MySQL, so this operator should
    be used for smallish amounts of data.

    :param sql: SQL query to execute against Presto. (templated)
    :param mysql_table: target MySQL table, use dot notation to target a
        specific database. (templated)
    :param mysql_conn_id: Reference to :ref:`mysql connection id <howto/connection:mysql>`.
    :param presto_conn_id: source presto connection
    :param mysql_preoperator: sql statement to run against mysql prior to
        import, typically used to truncate or delete existing data so the
        incoming rows replace it, allowing the task to be idempotent (running
        the task twice won't double load data). (templated)
    """

    template_fields: Sequence[str] = ('sql', 'mysql_table',
                                      'mysql_preoperator')
    template_ext: Sequence[str] = ('.sql', )
    # TODO: Remove renderer check when the provider has an Airflow 2.3+ requirement.
    template_fields_renderers = {
        "sql": "sql",
        "mysql_preoperator": "mysql" if "mysql" in wwwutils.get_attr_renderer() else "sql",
    }
    ui_color = '#a0e08c'

    def __init__(
        self,
        *,
        sql: str,
        mysql_table: str,
        presto_conn_id: str = 'presto_default',
        mysql_conn_id: str = 'mysql_default',
        mysql_preoperator: Optional[str] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.sql = sql
        self.mysql_table = mysql_table
        self.mysql_conn_id = mysql_conn_id
        self.mysql_preoperator = mysql_preoperator
        self.presto_conn_id = presto_conn_id

    def execute(self, context: 'Context') -> None:
        presto = PrestoHook(presto_conn_id=self.presto_conn_id)
        self.log.info("Extracting data from Presto: %s", self.sql)
        results = presto.get_records(self.sql)

        mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
        if self.mysql_preoperator:
            self.log.info("Running MySQL preoperator")
            self.log.info(self.mysql_preoperator)
            mysql.run(self.mysql_preoperator)

        self.log.info("Inserting rows into MySQL")
        mysql.insert_rows(table=self.mysql_table, rows=results)
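
The mysql_preoperator is what makes the transfer idempotent. A sketch with hypothetical table and query names:

load_orders = PrestoToMySqlOperator(
    task_id='presto_to_mysql',
    sql="SELECT order_id, total FROM sales.orders WHERE ds = '{{ ds }}'",
    mysql_table='analytics.orders',
    # Truncating first means re-running the task will not double-load rows.
    mysql_preoperator='TRUNCATE TABLE analytics.orders',
)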
Example 3
class RedshiftSQLOperator(BaseOperator):
    """
    Executes SQL Statements against an Amazon Redshift cluster

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:RedshiftSQLOperator`

    :param sql: the SQL code to be executed as a single string, or
        a list of str (sql statements), or a reference to a template file.
        Template references are recognized by str ending in '.sql'
    :param redshift_conn_id: reference to
        :ref:`Amazon Redshift connection id<howto/connection:redshift>`
    :param parameters: (optional) the parameters to render the SQL query with.
    :param autocommit: if True, each command is automatically committed.
        (default value: True)
    """

    template_fields: Sequence[str] = ('sql', )
    template_ext: Sequence[str] = ('.sql', )
    # TODO: Remove renderer check when the provider has an Airflow 2.3+ requirement.
    template_fields_renderers = {
        "sql": "postgresql" if "postgresql" in wwwutils.get_attr_renderer() else "sql"
    }

    def __init__(
        self,
        *,
        sql: Union[str, Iterable[str]],
        redshift_conn_id: str = 'redshift_default',
        parameters: Optional[Union[Iterable, Mapping]] = None,
        autocommit: bool = True,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.sql = sql
        self.autocommit = autocommit
        self.parameters = parameters

    def get_hook(self) -> RedshiftSQLHook:
        """Create and return RedshiftSQLHook.
        :return RedshiftSQLHook: A RedshiftSQLHook instance.
        """
        return RedshiftSQLHook(redshift_conn_id=self.redshift_conn_id)

    def execute(self, context: 'Context') -> None:
        """Execute a statement against Amazon Redshift"""
        self.log.info("Executing statement: %s", self.sql)
        hook = self.get_hook()
        hook.run(self.sql,
                 autocommit=self.autocommit,
                 parameters=self.parameters)
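
A usage sketch with hypothetical table names; note that with this operator's autocommit=True default, each statement in the list is committed as it runs:

setup_fruit_table = RedshiftSQLOperator(
    task_id='setup_fruit_table',
    sql=[
        'CREATE TABLE IF NOT EXISTS fruit (fruit_id INTEGER, name VARCHAR)',
        "INSERT INTO fruit VALUES (1, 'banana')",
    ],
)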
Example 4
class PostgresOperator(BaseOperator):
    """
    Executes sql code in a specific Postgres database

    :param sql: the SQL code to be executed as a single string, or
        a list of str (sql statements), or a reference to a template file.
        Template references are recognized by str ending in '.sql'
    :param postgres_conn_id: The :ref:`postgres conn id <howto/connection:postgres>`
        reference to a specific postgres database.
    :param autocommit: if True, each command is automatically committed.
        (default value: False)
    :param parameters: (optional) the parameters to render the SQL query with.
    :param database: name of the database which overrides the one defined in the connection
    """

    template_fields: Sequence[str] = ('sql', )
    # TODO: Remove renderer check when the provider has an Airflow 2.3+ requirement.
    template_fields_renderers = {
        'sql': 'postgresql' if 'postgresql' in wwwutils.get_attr_renderer() else 'sql'
    }
    template_ext: Sequence[str] = ('.sql', )
    ui_color = '#ededed'

    def __init__(
        self,
        *,
        sql: Union[str, List[str]],
        postgres_conn_id: str = 'postgres_default',
        autocommit: bool = False,
        parameters: Optional[Union[Mapping, Iterable]] = None,
        database: Optional[str] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.sql = sql
        self.postgres_conn_id = postgres_conn_id
        self.autocommit = autocommit
        self.parameters = parameters
        self.database = database
        self.hook: Optional[PostgresHook] = None

    def execute(self, context: 'Context'):
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                                 schema=self.database)
        self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
        for output in self.hook.conn.notices:
            self.log.info(output)
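
A usage sketch with %(name)s-style parameter binding (table and dates hypothetical):

get_birth_date = PostgresOperator(
    task_id='get_birth_date',
    sql='SELECT * FROM pet WHERE birth_date BETWEEN %(begin_date)s AND %(end_date)s',
    parameters={'begin_date': '2020-01-01', 'end_date': '2020-12-31'},
)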
Example 5
    def setUp(self):
        self.attr_renderer = utils.get_attr_renderer()
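
For context, a hypothetical companion assertion (names assumed, not from the source): get_attr_renderer() returns a mapping of renderer names to rendering callables, and the fallback checks in the operators above simply probe it for dialect keys that older Airflow versions lack.

    def test_attr_renderer_has_sql(self):
        # 'sql' is a baseline renderer; dialect-specific keys such as
        # 'mysql', 'tsql', or 'postgresql' may be absent on older versions.
        assert 'sql' in self.attr_renderer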
Example 6
class MsSqlOperator(BaseOperator):
    """
    Executes sql code in a specific Microsoft SQL database

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:MsSqlOperator`

    This operator may use one of two hooks, depending on the ``conn_type`` of the connection.

    If conn_type is ``'odbc'``, then :py:class:`~airflow.providers.odbc.hooks.odbc.OdbcHook`
    is used.  Otherwise, :py:class:`~airflow.providers.microsoft.mssql.hooks.mssql.MsSqlHook` is used.

    :param sql: the sql code to be executed (templated)
    :param mssql_conn_id: reference to a specific mssql database
    :param parameters: (optional) the parameters to render the SQL query with.
    :param autocommit: if True, each command is automatically committed.
        (default value: False)
    :param database: name of the database which overrides the one defined in the connection
    """

    template_fields: Sequence[str] = ('sql', )
    template_ext: Sequence[str] = ('.sql', )
    # TODO: Remove renderer check when the provider has an Airflow 2.3+ requirement.
    template_fields_renderers = {
        'sql': 'tsql' if 'tsql' in wwwutils.get_attr_renderer() else 'sql'
    }
    ui_color = '#ededed'

    def __init__(
        self,
        *,
        sql: str,
        mssql_conn_id: str = 'mssql_default',
        parameters: Optional[Union[Mapping, Iterable]] = None,
        autocommit: bool = False,
        database: Optional[str] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.mssql_conn_id = mssql_conn_id
        self.sql = sql
        self.parameters = parameters
        self.autocommit = autocommit
        self.database = database
        self._hook: Optional[Union[MsSqlHook, 'DbApiHook']] = None

    def get_hook(self) -> Optional[Union[MsSqlHook, 'DbApiHook']]:
        """
        Will retrieve hook as determined by :meth:`~.Connection.get_hook` if one is defined, and
        :class:`~.MsSqlHook` otherwise.

        For example, if the connection ``conn_type`` is ``'odbc'``, :class:`~.OdbcHook` will be used.
        """
        if not self._hook:
            conn = MsSqlHook.get_connection(conn_id=self.mssql_conn_id)
            try:
                self._hook = conn.get_hook()
                self._hook.schema = self.database  # type: ignore[union-attr]
            except AirflowException:
                self._hook = MsSqlHook(mssql_conn_id=self.mssql_conn_id,
                                       schema=self.database)
        return self._hook

    def execute(self, context: 'Context') -> None:
        self.log.info('Executing: %s', self.sql)
        hook = self.get_hook()
        hook.run(  # type: ignore[union-attr]
            sql=self.sql,
            autocommit=self.autocommit,
            parameters=self.parameters)
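
A usage sketch (table hypothetical). Whether OdbcHook or MsSqlHook ends up running the statement is decided solely by the connection's conn_type, not by anything passed here:

drop_pet_table = MsSqlOperator(
    task_id='drop_pet_table',
    sql='DROP TABLE IF EXISTS dbo.pet',
    # DDL against SQL Server is simplest with autocommit enabled.
    autocommit=True,
)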
Example 7
class MsSqlToHiveOperator(BaseOperator):
    """
    Moves data from Microsoft SQL Server to Hive. The operator runs
    your query against Microsoft SQL Server, stores the results in a
    local file, and then loads that file into a Hive table. If the
    ``create`` or ``recreate`` arguments are set to ``True``,
    ``CREATE TABLE`` and ``DROP TABLE`` statements are generated.
    Hive data types are inferred from the cursor's metadata.
    Note that the table generated in Hive uses ``STORED AS textfile``
    which isn't the most efficient serialization format. If a
    large amount of data is loaded and/or if the table gets
    queried considerably, you may want to use this operator only to
    stage the data into a temporary table before loading it into its
    final destination using a ``HiveOperator``.

    :param sql: SQL query to execute against the Microsoft SQL Server
        database. (templated)
    :param hive_table: target Hive table, use dot notation to target a specific
        database. (templated)
    :param create: whether to create the table if it doesn't exist
    :param recreate: whether to drop and recreate the table at every execution
    :param partition: target partition as a dict of partition columns and
        values. (templated)
    :param delimiter: field delimiter in the file
    :param mssql_conn_id: source Microsoft SQL Server connection
    :param hive_cli_conn_id: Reference to the
        :ref:`Hive CLI connection id <howto/connection:hive_cli>`.
    :param tblproperties: TBLPROPERTIES of the hive table being created
    """

    template_fields: Sequence[str] = ('sql', 'partition', 'hive_table')
    template_ext: Sequence[str] = ('.sql', )
    # TODO: Remove renderer check when the provider has an Airflow 2.3+ requirement.
    template_fields_renderers = {
        'sql': 'tsql' if 'tsql' in wwwutils.get_attr_renderer() else 'sql'
    }
    ui_color = '#a0e08c'

    def __init__(
        self,
        *,
        sql: str,
        hive_table: str,
        create: bool = True,
        recreate: bool = False,
        partition: Optional[Dict] = None,
        delimiter: str = chr(1),
        mssql_conn_id: str = 'mssql_default',
        hive_cli_conn_id: str = 'hive_cli_default',
        tblproperties: Optional[Dict] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.sql = sql
        self.hive_table = hive_table
        self.create = create
        self.recreate = recreate
        self.delimiter = delimiter
        self.mssql_conn_id = mssql_conn_id
        self.hive_cli_conn_id = hive_cli_conn_id
        self.partition = partition or {}
        self.tblproperties = tblproperties

    @classmethod
    def type_map(cls, mssql_type: int) -> str:
        """Maps MsSQL type to Hive type."""
        map_dict = {
            pymssql.BINARY.value: 'INT',
            pymssql.DECIMAL.value: 'FLOAT',
            pymssql.NUMBER.value: 'INT',
        }
        return map_dict.get(mssql_type, 'STRING')

    def execute(self, context: "Context"):
        mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)
        self.log.info(
            "Dumping Microsoft SQL Server query results to local file")
        with mssql.get_conn() as conn:
            with conn.cursor() as cursor:
                cursor.execute(self.sql)
                # csv.writer has no ``encoding`` argument; set the encoding
                # on the temporary file instead.
                with NamedTemporaryFile("w", encoding="utf-8") as tmp_file:
                    csv_writer = csv.writer(tmp_file, delimiter=self.delimiter)
                    field_dict = OrderedDict()
                    for col_count, field in enumerate(cursor.description, start=1):
                        col_position = f"Column{col_count}"
                        # Fall back to a positional name for unnamed columns.
                        field_dict[col_position if field[0] == '' else field[0]] = self.type_map(field[1])
                    csv_writer.writerows(cursor)
                    tmp_file.flush()

                    # Load while the temporary file still exists; it is
                    # deleted as soon as the ``with`` block exits.
                    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
                    self.log.info("Loading file into Hive")
                    hive.load_file(
                        tmp_file.name,
                        self.hive_table,
                        field_dict=field_dict,
                        create=self.create,
                        partition=self.partition,
                        delimiter=self.delimiter,
                        recreate=self.recreate,
                        tblproperties=self.tblproperties,
                    )
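
A transfer sketch (connection ids use the defaults; table names are hypothetical), staging one execution date per Hive partition as the docstring recommends:

stage_orders = MsSqlToHiveOperator(
    task_id='stage_orders',
    sql="SELECT * FROM dbo.orders WHERE order_date = '{{ ds }}'",
    hive_table='staging.orders',
    # Load into one partition per execution date.
    partition={'ds': '{{ ds }}'},
)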
Example 8
"""This module contains an operator to move data from Hive to MySQL."""
from tempfile import NamedTemporaryFile
from typing import TYPE_CHECKING, Dict, Optional, Sequence

from airflow.models import BaseOperator
from airflow.providers.apache.hive.hooks.hive import HiveServer2Hook
from airflow.providers.mysql.hooks.mysql import MySqlHook
from airflow.utils.operator_helpers import context_to_airflow_vars
from airflow.www import utils as wwwutils

if TYPE_CHECKING:
    from airflow.utils.context import Context

# TODO: Remove renderer check when the provider has an Airflow 2.3+ requirement.
MYSQL_RENDERER = 'mysql' if 'mysql' in wwwutils.get_attr_renderer() else 'sql'


class HiveToMySqlOperator(BaseOperator):
    """
    Moves data from Hive to MySQL. Note that for now the data is loaded
    into memory before being pushed to MySQL, so this operator should
    be used for smallish amounts of data.

    :param sql: SQL query to execute against Hive server. (templated)
    :param mysql_table: target MySQL table, use dot notation to target a
        specific database. (templated)
    :param mysql_conn_id: target mysql connection
    :param metastore_conn_id: Reference to the
        :ref:`metastore thrift service connection id <howto/connection:hive_metastore>`.
    :param mysql_preoperator: sql statement to run against mysql prior to
Example 9
class PostgresOperator(BaseOperator):
    """
    Executes sql code in a specific Postgres database

    :param sql: the SQL code to be executed as a single string, or
        a list of str (sql statements), or a reference to a template file.
        Template references are recognized by str ending in '.sql'
    :param postgres_conn_id: The :ref:`postgres conn id <howto/connection:postgres>`
        reference to a specific postgres database.
    :param autocommit: if True, each command is automatically committed.
        (default value: False)
    :param parameters: (optional) the parameters to render the SQL query with.
    :param database: name of the database which overrides the one defined in the connection
    :param runtime_parameters: a mapping of runtime params to set for the session
        before the SQL executes, e.g. ``{"statement_timeout": "3000ms"}``
    """

    template_fields: Sequence[str] = ('sql', )
    # TODO: Remove renderer check when the provider has an Airflow 2.3+ requirement.
    template_fields_renderers = {
        'sql': 'postgresql' if 'postgresql' in wwwutils.get_attr_renderer() else 'sql'
    }
    template_ext: Sequence[str] = ('.sql', )
    ui_color = '#ededed'

    def __init__(
        self,
        *,
        sql: Union[str, Iterable[str]],
        postgres_conn_id: str = 'postgres_default',
        autocommit: bool = False,
        parameters: Optional[Union[Iterable, Mapping]] = None,
        database: Optional[str] = None,
        runtime_parameters: Optional[Mapping] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.sql = sql
        self.postgres_conn_id = postgres_conn_id
        self.autocommit = autocommit
        self.parameters = parameters
        self.database = database
        self.runtime_parameters = runtime_parameters
        self.hook: Optional[PostgresHook] = None

    def execute(self, context: 'Context'):
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                                 schema=self.database)
        if self.runtime_parameters:
            final_sql = []
            sql_param = {}
            # Prepend one "SET <param> TO %(param)s" statement per runtime
            # parameter, quoting the parameter name as an identifier.
            for param in self.runtime_parameters:
                set_param_sql = f"SET {{}} TO %({param})s;"
                dynamic_sql = SQL(set_param_sql).format(Identifier(param))
                final_sql.append(dynamic_sql)
            for param, val in self.runtime_parameters.items():
                sql_param[param] = str(val)
            if self.parameters:
                sql_param.update(self.parameters)
            if isinstance(self.sql, str):
                final_sql.append(SQL(self.sql))
            else:
                final_sql.extend(list(map(SQL, self.sql)))
            self.hook.run(final_sql, self.autocommit, parameters=sql_param)
        else:
            self.hook.run(self.sql,
                          self.autocommit,
                          parameters=self.parameters)
        for output in self.hook.conn.notices:
            self.log.info(output)
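
A sketch of the runtime_parameters path (values hypothetical): each entry becomes a SET statement prepended to the query, so the setting applies only to this task's session:

timed_query = PostgresOperator(
    task_id='timed_query',
    sql='SELECT * FROM pet',
    # Expands to: SET "statement_timeout" TO %(statement_timeout)s; SELECT ...
    runtime_parameters={'statement_timeout': '3000ms'},
)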