Example #1
    def test_mysql_to_hive_verify_loaded_values(self):
        mysql_conn_id = 'airflow_ci'
        mysql_table = 'test_mysql_to_hive'
        hive_table = 'test_mysql_to_hive'

        from airflow.hooks.mysql_hook import MySqlHook
        m = MySqlHook(mysql_conn_id)

        try:
            minmax = (255, 65535, 16777215, 4294967295, 18446744073709551615,
                      -128, -32768, -8388608, -2147483648,
                      -9223372036854775808)

            with m.get_conn() as c:
                c.execute("DROP TABLE IF EXISTS {}".format(mysql_table))
                c.execute("""
                    CREATE TABLE {} (
                        c0 TINYINT   UNSIGNED,
                        c1 SMALLINT  UNSIGNED,
                        c2 MEDIUMINT UNSIGNED,
                        c3 INT       UNSIGNED,
                        c4 BIGINT    UNSIGNED,
                        c5 TINYINT,
                        c6 SMALLINT,
                        c7 MEDIUMINT,
                        c8 INT,
                        c9 BIGINT
                    )
                """.format(mysql_table))
                c.execute("""
                    INSERT INTO {} VALUES (
                        {}, {}, {}, {}, {}, {}, {}, {}, {}, {}
                    )
                """.format(mysql_table, *minmax))

            from airflow.operators.mysql_to_hive import MySqlToHiveTransfer
            t = MySqlToHiveTransfer(task_id='test_m2h',
                                    mysql_conn_id=mysql_conn_id,
                                    hive_cli_conn_id='beeline_default',
                                    sql="SELECT * FROM {}".format(mysql_table),
                                    hive_table=hive_table,
                                    recreate=True,
                                    delimiter=",",
                                    dag=self.dag)
            t.run(start_date=DEFAULT_DATE,
                  end_date=DEFAULT_DATE,
                  ignore_ti_state=True)

            from airflow.hooks.hive_hooks import HiveServer2Hook
            h = HiveServer2Hook()
            r = h.get_records("SELECT * FROM {}".format(hive_table))
            self.assertEqual(r[0], minmax)
        finally:
            with m.get_conn() as c:
                c.execute("DROP TABLE IF EXISTS {}".format(mysql_table))
Example #2
 def test_mysql_to_hive(self):
     from airflow.operators.mysql_to_hive import MySqlToHiveTransfer
     sql = "SELECT * FROM baby_names LIMIT 1000;"
     op = MySqlToHiveTransfer(
         task_id='test_m2h',
         hive_cli_conn_id='hive_cli_default',
         sql=sql,
         hive_table='test_mysql_to_hive',
         recreate=True,
         delimiter=",",
         dag=self.dag)
     op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
Example #3
 def test_mysql_to_hive(self):
     from airflow.operators.mysql_to_hive import MySqlToHiveTransfer
     sql = "SELECT * FROM baby_names LIMIT 1000;"
     t = MySqlToHiveTransfer(
         task_id='test_m2h',
         hive_cli_conn_id='hive_cli_default',
         sql=sql,
         hive_table='test_mysql_to_hive',
         recreate=True,
         delimiter=",",
         dag=self.dag)
     t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
Example #4
 def test_mysql_to_hive_tblproperties(self):
     from airflow.operators.mysql_to_hive import MySqlToHiveTransfer
     sql = "SELECT * FROM baby_names LIMIT 1000;"
     t = MySqlToHiveTransfer(
         task_id='test_m2h',
         hive_cli_conn_id='beeline_default',
         sql=sql,
         hive_table='test_mysql_to_hive',
         recreate=True,
         delimiter=",",
         tblproperties={'test_property': 'test_value'},
         dag=self.dag)
     t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
Example #5
 def test_mysql_to_hive(self):
     # import airflow.operators
     from airflow.operators.mysql_to_hive import MySqlToHiveTransfer
     sql = "SELECT * FROM baby_names LIMIT 1000;"
     t = MySqlToHiveTransfer(task_id='test_m2h',
                             mysql_conn_id='airflow_ci',
                             hive_cli_conn_id='beeline_default',
                             sql=sql,
                             hive_table='test_mysql_to_hive',
                             recreate=True,
                             delimiter=",",
                             dag=self.dag)
     t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, force=True)
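Note the force=True here where the other examples pass ignore_ti_state=True: the keyword
was renamed in later Airflow releases, so the two calls below should be equivalent across
versions (assuming that rename is the only difference):

    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, force=True)             # older Airflow
    t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)   # newer Airflow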
Example #6
 def test_mysql_to_hive_partition(self):
     from airflow.operators.mysql_to_hive import MySqlToHiveTransfer
     sql = "SELECT * FROM baby_names LIMIT 1000;"
     t = MySqlToHiveTransfer(
         task_id='test_m2h',
         hive_cli_conn_id='beeline_default',
         sql=sql,
         hive_table='test_mysql_to_hive_part',
         partition={'ds': DEFAULT_DATE_DS},
         recreate=False,
         create=True,
         delimiter=",",
         dag=self.dag)
     t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
Example #7
 def test_mysql_to_hive_partition(self):
     from airflow.operators.mysql_to_hive import MySqlToHiveTransfer
     sql = "SELECT * FROM baby_names LIMIT 1000;"
     t = MySqlToHiveTransfer(
         task_id='test_m2h',
         mysql_conn_id='airflow_ci',
         hive_cli_conn_id='beeline_default',
         sql=sql,
         hive_table='test_mysql_to_hive_part',
         partition={'ds': DEFAULT_DATE_DS},
         recreate=False,
         create=True,
         delimiter=",",
         dag=self.dag)
     t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, force=True)
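The partition dict is rendered into a Hive PARTITION clause when the staged file is
loaded. A minimal sketch of such a rendering (illustrative, not the hook's exact code):

    partition = {'ds': '2015-01-01'}
    pvals = ", ".join("{}='{}'".format(k, v) for k, v in partition.items())
    print("PARTITION ({})".format(pvals))  # PARTITION (ds='2015-01-01')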
Example #8
 def test_mysql_to_hive_tblproperties(self):
     # import airflow.operators
     from airflow.operators.mysql_to_hive import MySqlToHiveTransfer
     sql = "SELECT * FROM baby_names LIMIT 1000;"
     t = MySqlToHiveTransfer(
         task_id='test_m2h',
         mysql_conn_id='airflow_ci',
         hive_cli_conn_id='beeline_default',
         sql=sql,
         hive_table='test_mysql_to_hive',
         recreate=True,
         delimiter=",",
         tblproperties={'test_property': 'test_value'},
         dag=self.dag)
     t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)
Example #9
    def test_mysql_to_hive_verify_csv_special_char(self):
        mysql_table = 'test_mysql_to_hive'
        hive_table = 'test_mysql_to_hive'

        from airflow.hooks.mysql_hook import MySqlHook
        hook = MySqlHook()

        try:
            db_record = (
                'c0',
                '["true"]'
            )
            with hook.get_conn() as conn:
                conn.execute("DROP TABLE IF EXISTS {}".format(mysql_table))
                conn.execute("""
                    CREATE TABLE {} (
                        c0 VARCHAR(25),
                        c1 VARCHAR(25)
                    )
                """.format(mysql_table))
                conn.execute("""
                    INSERT INTO {} VALUES (
                        '{}', '{}'
                    )
                """.format(mysql_table, *db_record))

            from airflow.operators.mysql_to_hive import MySqlToHiveTransfer
            import unicodecsv as csv
            op = MySqlToHiveTransfer(
                task_id='test_m2h',
                hive_cli_conn_id='hive_cli_default',
                sql="SELECT * FROM {}".format(mysql_table),
                hive_table=hive_table,
                recreate=True,
                delimiter=",",
                quoting=csv.QUOTE_NONE,
                quotechar='',
                escapechar='@',
                dag=self.dag)
            op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

            from airflow.providers.apache.hive.hooks.hive import HiveServer2Hook
            hive_hook = HiveServer2Hook()
            result = hive_hook.get_records("SELECT * FROM {}".format(hive_table))
            self.assertEqual(result[0], db_record)
        finally:
            with hook.get_conn() as conn:
                conn.execute("DROP TABLE IF EXISTS {}".format(mysql_table))
Example #10
    def test_mysql_to_hive_type_conversion(self, mock_load_file):
        mysql_conn_id = 'airflow_ci'
        mysql_table = 'test_mysql_to_hive'

        from airflow.hooks.mysql_hook import MySqlHook
        m = MySqlHook(mysql_conn_id)

        try:
            with m.get_conn() as c:
                c.execute("DROP TABLE IF EXISTS {}".format(mysql_table))
                c.execute("""
                    CREATE TABLE {} (
                        c0 TINYINT,
                        c1 SMALLINT,
                        c2 MEDIUMINT,
                        c3 INT,
                        c4 BIGINT,
                        c5 TIMESTAMP
                    )
                """.format(mysql_table))

            from airflow.operators.mysql_to_hive import MySqlToHiveTransfer
            t = MySqlToHiveTransfer(task_id='test_m2h',
                                    mysql_conn_id=mysql_conn_id,
                                    hive_cli_conn_id='beeline_default',
                                    sql="SELECT * FROM {}".format(mysql_table),
                                    hive_table='test_mysql_to_hive',
                                    dag=self.dag)
            t.run(start_date=DEFAULT_DATE,
                  end_date=DEFAULT_DATE,
                  ignore_ti_state=True)

            mock_load_file.assert_called_once()
            d = OrderedDict()
            d["c0"] = "SMALLINT"
            d["c1"] = "INT"
            d["c2"] = "INT"
            d["c3"] = "BIGINT"
            d["c4"] = "DECIMAL(38,0)"
            d["c5"] = "TIMESTAMP"
            self.assertEqual(mock_load_file.call_args[1]["field_dict"], d)
        finally:
            with m.get_conn() as c:
                c.execute("DROP TABLE IF EXISTS {}".format(mysql_table))
Example #11
    def test_mysql_to_hive_type_conversion(self, mock_load_file):
        mysql_table = 'test_mysql_to_hive'

        from airflow.hooks.mysql_hook import MySqlHook
        hook = MySqlHook()

        try:
            with hook.get_conn() as conn:
                conn.execute("DROP TABLE IF EXISTS {}".format(mysql_table))
                conn.execute("""
                    CREATE TABLE {} (
                        c0 TINYINT,
                        c1 SMALLINT,
                        c2 MEDIUMINT,
                        c3 INT,
                        c4 BIGINT,
                        c5 TIMESTAMP
                    )
                """.format(mysql_table))

            from airflow.operators.mysql_to_hive import MySqlToHiveTransfer
            op = MySqlToHiveTransfer(
                task_id='test_m2h',
                hive_cli_conn_id='hive_cli_default',
                sql="SELECT * FROM {}".format(mysql_table),
                hive_table='test_mysql_to_hive',
                dag=self.dag)
            op.run(start_date=DEFAULT_DATE,
                   end_date=DEFAULT_DATE,
                   ignore_ti_state=True)

            assert mock_load_file.call_count == 1
            ordered_dict = OrderedDict()
            ordered_dict["c0"] = "SMALLINT"
            ordered_dict["c1"] = "INT"
            ordered_dict["c2"] = "INT"
            ordered_dict["c3"] = "BIGINT"
            ordered_dict["c4"] = "DECIMAL(38,0)"
            ordered_dict["c5"] = "TIMESTAMP"
            self.assertEqual(mock_load_file.call_args[1]["field_dict"],
                             ordered_dict)
        finally:
            with hook.get_conn() as conn:
                conn.execute("DROP TABLE IF EXISTS {}".format(mysql_table))
Example #12
    def test_mysql_to_hive_type_conversion(self, mock_load_file):
        mysql_conn_id = 'airflow_ci'
        mysql_table = 'test_mysql_to_hive'

        from airflow.hooks.mysql_hook import MySqlHook
        m = MySqlHook(mysql_conn_id)

        try:
            with m.get_conn() as c:
                c.execute("DROP TABLE IF EXISTS {}".format(mysql_table))
                c.execute("""
                    CREATE TABLE {} (
                        c0 TINYINT,
                        c1 SMALLINT,
                        c2 MEDIUMINT,
                        c3 INT,
                        c4 BIGINT
                    )
                """.format(mysql_table))

            from airflow.operators.mysql_to_hive import MySqlToHiveTransfer
            t = MySqlToHiveTransfer(
                task_id='test_m2h',
                mysql_conn_id=mysql_conn_id,
                hive_cli_conn_id='beeline_default',
                sql="SELECT * FROM {}".format(mysql_table),
                hive_table='test_mysql_to_hive',
                dag=self.dag)
            t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

            mock_load_file.assert_called_once()
            d = OrderedDict()
            d["c0"] = "SMALLINT"
            d["c1"] = "INT"
            d["c2"] = "INT"
            d["c3"] = "BIGINT"
            d["c4"] = "DECIMAL(38,0)"
            self.assertEqual(mock_load_file.call_args[1]["field_dict"], d)
        finally:
            with m.get_conn() as c:
                c.execute("DROP TABLE IF EXISTS {}".format(mysql_table))
Example #13
    def test_mysql_to_hive_tblproperties(self, mock_popen, mock_temp_dir):
        mock_subprocess = MockSubProcess()
        mock_popen.return_value = mock_subprocess
        mock_temp_dir.return_value = "test_mysql_to_hive_tblproperties"
        with mock.patch.dict('os.environ', self.env_vars):
            sql = "SELECT * FROM baby_names LIMIT 1000;"
            t = MySqlToHiveTransfer(
                task_id='test_m2h',
                hive_cli_conn_id='beeline_default',
                sql=sql,
                hive_table='test_mysql_to_hive_tblproperties',
                recreate=True,
                delimiter=",",
                tblproperties={'test_property': 'test_value'},
                dag=self.dag)
            t.run(start_date=DEFAULT_DATE,
                  end_date=DEFAULT_DATE,
                  ignore_ti_state=True)

        hive_cmd = [
            u'beeline', u'-u', u'"jdbc:hive2://localhost:10000/default"',
            u'-hiveconf', u'airflow.ctx.dag_email=test@example.com',
            u'-hiveconf', u'airflow.ctx.dag_id=test_dag_id', u'-hiveconf',
            u'airflow.ctx.dag_owner=airflow', u'-hiveconf',
            u'airflow.ctx.dag_run_id=55', u'-hiveconf',
            u'airflow.ctx.execution_date=2015-01-01T00:00:00+00:00',
            u'-hiveconf', u'airflow.ctx.task_id=test_task_id', u'-hiveconf',
            u'mapreduce.job.queuename=airflow', u'-hiveconf',
            u'mapred.job.queue.name=airflow', u'-hiveconf',
            u'tez.queue.name=airflow', u'-f',
            u'/tmp/airflow_hiveop_test_mysql_to_hive_tblproperties/'
            u'tmptest_mysql_to_hive_tblproperties'
        ]

        mock_popen.assert_called_with(
            hive_cmd,
            stdout=mock_subprocess.PIPE,
            stderr=mock_subprocess.STDOUT,
            cwd=u"/tmp/airflow_hiveop_test_mysql_to_hive_tblproperties",
            close_fds=True)
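The tblproperties dict ends up appended to the generated CREATE TABLE statement. A hedged
sketch of how such a clause can be rendered (illustrative; the hook's real code path may
differ):

    tblproperties = {'test_property': 'test_value'}
    clause = "TBLPROPERTIES({})".format(
        ", ".join("'{}'='{}'".format(k, v) for k, v in tblproperties.items()))
    print(clause)  # TBLPROPERTIES('test_property'='test_value')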
Example #14
    def match_datatransfer_operator(self):
        source_db_type = self.get_conn_db_type(self.source_conn_id)
        destination_db_type = self.get_conn_db_type(self.destination_conn_id)

        if source_db_type == 'mysql' and destination_db_type == 'hiveserver2':
            return MySqlToHiveTransfer(
                sql=self.sql,
                hive_table=self.destination_table,
                create=False,
                recreate=False,
                partition=None,
                delimiter=chr(1),
                mysql_conn_id=self.source_conn_id,
                hive_cli_conn_id=self.destination_conn_id,
                tblproperties=None)

        if source_db_type == 'mssql' and destination_db_type == 'hiveserver2':
            return MsSqlToHiveTransfer(
                sql=self.sql,
                hive_table=self.destination_table,
                create=False,
                recreate=False,
                partition=None,
                delimiter=chr(1),
                mssql_conn_id=self.source_conn_id,
                hive_cli_conn_id=self.destination_conn_id,
                tblproperties=None)

        if source_db_type == 'hiveserver2' and destination_db_type == 'mysql':
            return HiveToMySqlTransfer(sql=self.sql,
                                       mysql_table=self.destination_table,
                                       hiveserver2_conn_id=self.source_conn_id,
                                       mysql_conn_id=self.destination_conn_id,
                                       mysql_preoperator=None,
                                       mysql_postoperator=None,
                                       bulk_load=False)

        if source_db_type == 'oracle' and destination_db_type == 'oracle':
            return OracleToOracleTransfer(
                source_sql=self.sql,
                destination_table=self.destination_table,
                oracle_source_conn_id=self.source_conn_id,
                oracle_destination_conn_id=self.destination_conn_id,
                source_sql_params=None,
                rows_chunk=5000)

        return GenericTransfer(sql=self.sql,
                               destination_table=self.destination_table,
                               source_conn_id=self.source_conn_id,
                               destination_conn_id=self.destination_conn_id,
                               preoperator=None)
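get_conn_db_type is not shown in this example; a minimal sketch, assuming it simply
resolves the connection's conn_type through Airflow's BaseHook:

    from airflow.hooks.base_hook import BaseHook

    def get_conn_db_type(self, conn_id):
        # Returns the connection type, e.g. 'mysql', 'mssql', 'hiveserver2' or 'oracle'.
        return BaseHook.get_connection(conn_id).conn_type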
Example #15
    def test_mysql_to_hive_verify_loaded_values(self):
        mysql_conn_id = 'airflow_ci'
        mysql_table = 'test_mysql_to_hive'
        hive_table = 'test_mysql_to_hive'

        from airflow.hooks.mysql_hook import MySqlHook
        m = MySqlHook(mysql_conn_id)

        try:
            minmax = (
                255,
                65535,
                16777215,
                4294967295,
                18446744073709551615,
                -128,
                -32768,
                -8388608,
                -2147483648,
                -9223372036854775808
            )

            with m.get_conn() as c:
                c.execute("DROP TABLE IF EXISTS {}".format(mysql_table))
                c.execute("""
                    CREATE TABLE {} (
                        c0 TINYINT   UNSIGNED,
                        c1 SMALLINT  UNSIGNED,
                        c2 MEDIUMINT UNSIGNED,
                        c3 INT       UNSIGNED,
                        c4 BIGINT    UNSIGNED,
                        c5 TINYINT,
                        c6 SMALLINT,
                        c7 MEDIUMINT,
                        c8 INT,
                        c9 BIGINT
                    )
                """.format(mysql_table))
                c.execute("""
                    INSERT INTO {} VALUES (
                        {}, {}, {}, {}, {}, {}, {}, {}, {}, {}
                    )
                """.format(mysql_table, *minmax))

            from airflow.operators.mysql_to_hive import MySqlToHiveTransfer
            t = MySqlToHiveTransfer(
                task_id='test_m2h',
                mysql_conn_id=mysql_conn_id,
                hive_cli_conn_id='beeline_default',
                sql="SELECT * FROM {}".format(mysql_table),
                hive_table=hive_table,
                recreate=True,
                delimiter=",",
                dag=self.dag)
            t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

            from airflow.hooks.hive_hooks import HiveServer2Hook
            h = HiveServer2Hook()
            r = h.get_records("SELECT * FROM {}".format(hive_table))
            self.assertEqual(r[0], minmax)
        finally:
            with m.get_conn() as c:
                c.execute("DROP TABLE IF EXISTS {}".format(mysql_table))
Example #16
from airflow import DAG
from airflow.operators.mysql_to_hive import MySqlToHiveTransfer
from airflow.utils.dates import days_ago
from datetime import timedelta

default_args = {
  'owner': 'airflow',
  'start_date': days_ago(5),
  'email_on_failure': False,
  'email_on_retry': False,
  'depends_on_past': False,
  'retries': 1,
  'retry_delay': timedelta(minutes=1)
}

dag = DAG(
  dag_id='mysql_to_hive',
  template_searchpath='templates/mysql_to_hive',
  schedule_interval='@daily',
  default_args=default_args
)

MySqlToHiveTransfer(
  task_id='mysql_to_hive',
  sql='mysql_to_hive_sql.sql',
  delimiter=',',
  hive_table='mysql_to_hive_table',
  create=True,
  recreate=True,
  dag=dag
)
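Because sql is a templated field and the value ends in '.sql', Airflow loads and renders
templates/mysql_to_hive/mysql_to_hive_sql.sql at run time. An equivalent inline form,
using a hypothetical source table, would be:

    MySqlToHiveTransfer(
        task_id='mysql_to_hive_inline',
        sql="SELECT * FROM source_table WHERE ds = '{{ ds }}'",  # hypothetical table
        delimiter=',',
        hive_table='mysql_to_hive_table',
        create=True,
        recreate=True,
        dag=dag
    )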
Example #17
    def test_mysql_to_hive_verify_loaded_values(self, mock_popen,
                                                mock_temp_dir):
        mock_subprocess = MockSubProcess()
        mock_popen.return_value = mock_subprocess
        mock_temp_dir.return_value = "test_mysql_to_hive"

        mysql_table = 'test_mysql_to_hive'
        hive_table = 'test_mysql_to_hive'

        hook = MySqlHook()

        try:
            minmax = (255, 65535, 16777215, 4294967295, 18446744073709551615,
                      -128, -32768, -8388608, -2147483648,
                      -9223372036854775808)

            with hook.get_conn() as conn:
                conn.execute("DROP TABLE IF EXISTS {}".format(mysql_table))
                conn.execute("""
                    CREATE TABLE {} (
                        c0 TINYINT   UNSIGNED,
                        c1 SMALLINT  UNSIGNED,
                        c2 MEDIUMINT UNSIGNED,
                        c3 INT       UNSIGNED,
                        c4 BIGINT    UNSIGNED,
                        c5 TINYINT,
                        c6 SMALLINT,
                        c7 MEDIUMINT,
                        c8 INT,
                        c9 BIGINT
                    )
                """.format(mysql_table))
                conn.execute("""
                    INSERT INTO {} VALUES (
                        {}, {}, {}, {}, {}, {}, {}, {}, {}, {}
                    )
                """.format(mysql_table, *minmax))

            with mock.patch.dict('os.environ', self.env_vars):
                op = MySqlToHiveTransfer(
                    task_id='test_m2h',
                    hive_cli_conn_id='hive_cli_default',
                    sql="SELECT * FROM {}".format(mysql_table),
                    hive_table=hive_table,
                    recreate=True,
                    delimiter=",",
                    dag=self.dag)
                op.run(start_date=DEFAULT_DATE,
                       end_date=DEFAULT_DATE,
                       ignore_ti_state=True)

                mock_cursor = MockConnectionCursor()
                mock_cursor.iterable = [minmax]
                hive_hook = MockHiveServer2Hook(connection_cursor=mock_cursor)

                result = hive_hook.get_records(
                    "SELECT * FROM {}".format(hive_table))
                self.assertEqual(result[0], minmax)

                hive_cmd = [
                    u'hive', u'-hiveconf',
                    u'airflow.ctx.dag_email=test@example.com', u'-hiveconf',
                    u'airflow.ctx.dag_id=test_dag_id', u'-hiveconf',
                    u'airflow.ctx.dag_owner=airflow', u'-hiveconf',
                    u'airflow.ctx.dag_run_id=55', u'-hiveconf',
                    u'airflow.ctx.execution_date=2015-01-01T00:00:00+00:00',
                    u'-hiveconf', u'airflow.ctx.task_id=test_task_id',
                    u'-hiveconf', u'mapreduce.job.queuename=airflow',
                    u'-hiveconf', u'mapred.job.queue.name=airflow',
                    u'-hiveconf', u'tez.queue.name=airflow', u'-f',
                    u'/tmp/airflow_hiveop_test_mysql_to_hive/tmptest_mysql_to_hive'
                ]

                mock_popen.assert_called_with(
                    hive_cmd,
                    stdout=mock_subprocess.PIPE,
                    stderr=mock_subprocess.STDOUT,
                    cwd="/tmp/airflow_hiveop_test_mysql_to_hive",
                    close_fds=True)

        finally:
            with hook.get_conn() as conn:
                conn.execute("DROP TABLE IF EXISTS {}".format(mysql_table))