Example #1
    def execute(self, context):
        """
        Execute the bash command in a temporary directory
        which will be cleaned afterwards
        """
        self.log.info('Tmp dir root location: \n %s', gettempdir())

        # Prepare env for child process.
        if self.env is None:
            self.env = os.environ.copy()

        airflow_context_vars = context_to_airflow_vars(context, in_env_var_format=True)
        self.log.info('Exporting the following env vars:\n' +
                      '\n'.join(["{}={}".format(k, v)
                                 for k, v in
                                 airflow_context_vars.items()]))
        self.env.update(airflow_context_vars)

        self.lineage_data = self.bash_command

        with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
            with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as tmp_file:
                tmp_file.write(bytes(self.bash_command, 'utf_8'))
                tmp_file.flush()
                script_location = os.path.abspath(tmp_file.name)
                self.log.info('Temporary script location: %s', script_location)

                def pre_exec():
                    # Restore default signal disposition and invoke setsid
                    for sig in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
                        if hasattr(signal, sig):
                            signal.signal(getattr(signal, sig), signal.SIG_DFL)
                    os.setsid()

                self.log.info('Running command: %s', self.bash_command)
                sub_process = Popen(
                    ['bash', tmp_file.name],
                    stdout=PIPE,
                    stderr=STDOUT,
                    cwd=tmp_dir,
                    env=self.env,
                    preexec_fn=pre_exec)

                self.sub_process = sub_process

                self.log.info('Output:')
                line = ''
                for raw_line in iter(sub_process.stdout.readline, b''):
                    line = raw_line.decode(self.output_encoding).rstrip()
                    self.log.info(line)

                sub_process.wait()

                self.log.info('Command exited with return code %s', sub_process.returncode)

                if sub_process.returncode:
                    raise AirflowException('Bash command failed')

        if self.xcom_push_flag:
            return line
Example #2
    def execute(self, context):
        with NamedTemporaryFile() as tmp_file:
            self.log.info("Fetching file from Hive")
            hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id)
            hive.to_csv(hql=self.hql, csv_filepath=tmp_file.name,
                        hive_conf=context_to_airflow_vars(context))
            self.log.info("Pushing to samba")
            samba = SambaHook(samba_conn_id=self.samba_conn_id)
            samba.push_from_local(self.destination_filepath, tmp_file.name)
Example #3
    def execute(self, context):
        self.log.info('Executing: %s', self.hql)
        self.hook = self.get_hook()

        # set the mapred_job_name if it's not set with dag, task, execution time info
        if not self.mapred_job_name:
            ti = context['ti']
            self.hook.mapred_job_name = 'Airflow HiveOperator task for {}.{}.{}.{}'\
                .format(ti.hostname.split('.')[0], ti.dag_id, ti.task_id,
                        ti.execution_date.isoformat())

        if self.hiveconf_jinja_translate:
            self.hiveconfs = context_to_airflow_vars(context)
        else:
            self.hiveconfs.update(context_to_airflow_vars(context))

        self.log.info('Passing HiveConf: %s', self.hiveconfs)
        self.hook.run_cli(hql=self.hql, schema=self.schema, hive_conf=self.hiveconfs)
Example #4
    def test_context_to_airflow_vars_all_context(self):
        self.assertDictEqual(
            operator_helpers.context_to_airflow_vars(self.context),
            {
                'airflow.ctx.dag.dag_id': self.dag_id,
                'airflow.ctx.dag_run.execution_date': self.execution_date,
                'airflow.ctx.task.task_id': self.task_id,
                'airflow.ctx.task_instance.execution_date': self.execution_date,
            }
        )
Example #5
    def test_context_to_airflow_vars_all_context(self):
        self.assertDictEqual(
            operator_helpers.context_to_airflow_vars(self.context),
            {
                'airflow.ctx.dag_id': self.dag_id,
                'airflow.ctx.execution_date': self.execution_date,
                'airflow.ctx.task_id': self.task_id,
                'airflow.ctx.dag_run_id': self.dag_run_id,
            }
        )

        self.assertDictEqual(
            operator_helpers.context_to_airflow_vars(self.context,
                                                     in_env_var_format=True),
            {
                'AIRFLOW_CTX_DAG_ID': self.dag_id,
                'AIRFLOW_CTX_EXECUTION_DATE': self.execution_date,
                'AIRFLOW_CTX_TASK_ID': self.task_id,
                'AIRFLOW_CTX_DAG_RUN_ID': self.dag_run_id,
            }
        )
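
The two tests above pin down both naming schemes: dotted 'airflow.ctx.*' keys by default, and upper-cased 'AIRFLOW_CTX_*' keys when in_env_var_format=True. For orientation, a minimal sketch of that mapping follows. It is not Airflow's actual implementation (the real function lives in airflow.utils.operator_helpers and covers more fields); the attribute lookups and the isoformat() call are assumptions.

def context_to_airflow_vars_sketch(context, in_env_var_format=False):
    """Sketch: derive the airflow.ctx.* / AIRFLOW_CTX_* mapping the tests expect."""
    ti = context.get('task_instance')
    dag_run = context.get('dag_run')
    pairs = {}
    if ti is not None:
        pairs['airflow.ctx.dag_id'] = ti.dag_id
        pairs['airflow.ctx.task_id'] = ti.task_id
        # Assumption: execution_date is serialized with isoformat().
        pairs['airflow.ctx.execution_date'] = ti.execution_date.isoformat()
    if dag_run is not None:
        pairs['airflow.ctx.dag_run_id'] = dag_run.run_id
    if in_env_var_format:
        # 'airflow.ctx.dag_id' -> 'AIRFLOW_CTX_DAG_ID'
        return {k.upper().replace('.', '_'): v for k, v in pairs.items()}
    return pairs

# Matches the empty-context tests further below: no task instance, nothing exported.
assert context_to_airflow_vars_sketch({}) == {}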
Example #6
    def test_execute_with_hive_conf(self, mock_hive_hook, mock_mysql_hook):
        context = {}
        self.kwargs.update(dict(hive_conf={'mapreduce.job.queuename': 'fake_queue'}))

        HiveToMySqlTransfer(**self.kwargs).execute(context=context)

        hive_conf = context_to_airflow_vars(context)
        hive_conf.update(self.kwargs['hive_conf'])
        mock_hive_hook.return_value.get_records.assert_called_once_with(
            self.kwargs['sql'],
            hive_conf=hive_conf
        )
Example #7
    def test_execute(self, mock_tmp_file, mock_hive_hook, mock_samba_hook):
        type(mock_tmp_file).name = PropertyMock(return_value='tmp_file')
        mock_tmp_file.return_value.__enter__ = Mock(return_value=mock_tmp_file)
        context = {}

        Hive2SambaOperator(**self.kwargs).execute(context)

        mock_hive_hook.assert_called_once_with(hiveserver2_conn_id=self.kwargs['hiveserver2_conn_id'])
        mock_hive_hook.return_value.to_csv.assert_called_once_with(
            hql=self.kwargs['hql'],
            csv_filepath=mock_tmp_file.name,
            hive_conf=context_to_airflow_vars(context))
        mock_samba_hook.assert_called_once_with(samba_conn_id=self.kwargs['samba_conn_id'])
        mock_samba_hook.return_value.push_from_local.assert_called_once_with(
            self.kwargs['destination_filepath'], mock_tmp_file.name)
Example #8
    def execute(self, context):
        # Export context to make it available for callables to use.
        airflow_context_vars = context_to_airflow_vars(context, in_env_var_format=True)
        self.log.info("Exporting the following env vars:\n" +
                      '\n'.join(["{}={}".format(k, v)
                                 for k, v in airflow_context_vars.items()]))
        os.environ.update(airflow_context_vars)

        if self.provide_context:
            context.update(self.op_kwargs)
            context['templates_dict'] = self.templates_dict
            self.op_kwargs = context

        return_value = self.execute_callable()
        self.log.info("Done. Returned value was: %s", return_value)
        return return_value
Example #9
    def get_env(self, context):
        """Builds the set of environment variables to be exposed for the bash command"""
        system_env = os.environ.copy()
        env = self.env
        if env is None:
            env = system_env
        else:
            if self.append_env:
                system_env.update(env)
                env = system_env

        airflow_context_vars = context_to_airflow_vars(context,
                                                       in_env_var_format=True)
        self.log.debug(
            'Exporting the following env vars:\n%s',
            '\n'.join(f"{k}={v}" for k, v in airflow_context_vars.items()),
        )
        env.update(airflow_context_vars)
        return env
Example #10
    def execute(self, context: Dict):
        # Export context to make it available for callables to use.
        airflow_context_vars = context_to_airflow_vars(context,
                                                       in_env_var_format=True)
        self.log.info(
            "Exporting the following env vars:\n%s", '\n'.join([
                "{}={}".format(k, v) for k, v in airflow_context_vars.items()
            ]))
        os.environ.update(airflow_context_vars)

        context.update(self.op_kwargs)
        context['templates_dict'] = self.templates_dict

        self.op_kwargs = PythonOperator.determine_op_kwargs(
            self.python_callable, context, len(self.op_args))

        return_value = self.execute_callable()
        self.log.info("Done. Returned value was: %s", return_value)
        return return_value
Example #11
    def execute(self, context):
        if self.env is None:
            self.env = os.environ.copy()

        airflow_context_vars = context_to_airflow_vars(context,
                                                       in_env_var_format=True)

        self.env.update(airflow_context_vars)

        self.lineage_data = self.filename

        with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:

            def pre_exec():
                for sig in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
                    if hasattr(signal, sig):
                        signal.signal(getattr(signal, sig), signal.SIG_DFL)
                os.setsid()

            self.log.info("Running Golang program: %s", self.filename)
            sp = Popen(['go', 'run', self.filename],
                       stdout=PIPE,
                       stderr=STDOUT,
                       cwd=tmp_dir,
                       env=self.env,
                       preexec_fn=pre_exec)

            self.sp = sp

            self.log.info("Output:")
            line = ''
            for line in iter(sp.stdout.readline, b''):
                line = line.decode(self.output_encoding).rstrip()
                self.log.info(line)
            sp.wait()
            self.log.info("Command exited with return code %s", sp.returncode)

            if sp.returncode:
                raise AirflowException("Golang program failed")

        if self.xcom_push_flag:
            return line
Example #12
    def execute(self, context):
        # Export context to make it available for callables to use.
        airflow_context_vars = context_to_airflow_vars(context,
                                                       in_env_var_format=True)
        self.log.info(
            "Exporting the following env vars:\n%s",
            "\n".join([
                "{}={}".format(k, v) for k, v in airflow_context_vars.items()
            ]),
        )
        os.environ.update(airflow_context_vars)

        if self.provide_context:
            context.update(self.op_kwargs)
            context["templates_dict"] = self.templates_dict
            self.op_kwargs = context

        return_value = self.execute_callable()
        self.log.info("Done. Returned value was: %s", return_value)
        return return_value
Example #13
    def test_execute_bulk_load(self, mock_hive_hook, mock_tmp_file,
                               mock_mysql_hook):
        type(mock_tmp_file).name = PropertyMock(return_value='tmp_file')
        context = {}
        self.kwargs.update(dict(bulk_load=True))

        HiveToMySqlTransferOperator(**self.kwargs).execute(context=context)

        mock_tmp_file.assert_called_once_with()
        mock_hive_hook.return_value.to_csv.assert_called_once_with(
            self.kwargs['sql'],
            mock_tmp_file.return_value.name,
            delimiter='\t',
            lineterminator='\n',
            output_header=False,
            hive_conf=context_to_airflow_vars(context))
        mock_mysql_hook.return_value.bulk_load.assert_called_once_with(
            table=self.kwargs['mysql_table'],
            tmp_file=mock_tmp_file.return_value.name)
        mock_tmp_file.return_value.close.assert_called_once_with()
Example #14
    def test_execute_with_hive_conf(self, mock_mysql_hook):
        context = {}
        mock_hive_hook = MockHiveServer2Hook()
        mock_hive_hook.get_records = MagicMock(
            return_value='test_hive_results')

        self.kwargs.update(
            dict(hive_conf={'mapreduce.job.queuename': 'fake_queue'}))

        with patch(
                'airflow.providers.apache.hive.transfers.hive_to_mysql.HiveServer2Hook',
                return_value=mock_hive_hook,
        ):
            HiveToMySqlOperator(**self.kwargs).execute(context=context)

            hive_conf = context_to_airflow_vars(context)
            hive_conf.update(self.kwargs['hive_conf'])

        mock_hive_hook.get_records.assert_called_once_with(self.kwargs['sql'],
                                                           hive_conf=hive_conf)
Example #15
    def test_execute_bulk_load(self, mock_hive_hook, mock_tmp_file, mock_mysql_hook):
        type(mock_tmp_file).name = PropertyMock(return_value='tmp_file')
        context = {}
        self.kwargs.update(dict(bulk_load=True))

        HiveToMySqlTransfer(**self.kwargs).execute(context=context)

        mock_tmp_file.assert_called_once_with()
        mock_hive_hook.return_value.to_csv.assert_called_once_with(
            self.kwargs['sql'],
            mock_tmp_file.return_value.name,
            delimiter='\t',
            lineterminator='\n',
            output_header=False,
            hive_conf=context_to_airflow_vars(context)
        )
        mock_mysql_hook.return_value.bulk_load.assert_called_once_with(
            table=self.kwargs['mysql_table'],
            tmp_file=mock_tmp_file.return_value.name
        )
        mock_tmp_file.return_value.close.assert_called_once_with()
Example #16
    def execute(self, context):
        hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id)

        self.log.info("Extracting data from Hive: %s", self.sql)
        hive_conf = context_to_airflow_vars(context)
        if self.hive_conf:
            hive_conf.update(self.hive_conf)
        if self.bulk_load:
            tmp_file = NamedTemporaryFile()
            hive.to_csv(
                self.sql,
                tmp_file.name,
                delimiter='\t',
                lineterminator='\n',
                output_header=False,
                hive_conf=hive_conf,
            )
        else:
            hive_results = hive.get_records(self.sql, hive_conf=hive_conf)

        mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)

        if self.mysql_preoperator:
            self.log.info("Running MySQL preoperator")
            mysql.run(self.mysql_preoperator)

        self.log.info("Inserting rows into MySQL")
        if self.bulk_load:
            mysql.bulk_load(table=self.mysql_table, tmp_file=tmp_file.name)
            tmp_file.close()
        else:
            mysql.insert_rows(table=self.mysql_table, rows=hive_results)

        if self.mysql_postoperator:
            self.log.info("Running MySQL postoperator")
            mysql.run(self.mysql_postoperator)

        self.log.info("Done.")
Example #17
    def test_execute_bulk_load(self, mock_hive_hook, mock_tmp_file_context,
                               mock_mysql_hook):
        mock_tmp_file = MagicMock()
        mock_tmp_file.name = 'tmp_file'
        mock_tmp_file_context.return_value.__enter__.return_value = mock_tmp_file
        context = {}
        self.kwargs.update(dict(bulk_load=True))

        HiveToMySqlOperator(**self.kwargs).execute(context=context)

        mock_tmp_file_context.assert_called_once_with()
        mock_hive_hook.return_value.to_csv.assert_called_once_with(
            self.kwargs['sql'],
            'tmp_file',
            delimiter='\t',
            lineterminator='\n',
            output_header=False,
            hive_conf=context_to_airflow_vars(context),
        )
        mock_mysql_hook.return_value.bulk_load.assert_called_once_with(
            table=self.kwargs['mysql_table'], tmp_file='tmp_file')
        mock_tmp_file_context.return_value.__exit__.assert_called_once_with(
            None, None, None)
Example #18
    def execute(self, context):
        hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id)

        self.log.info("Extracting data from Hive: %s", self.sql)
        hive_conf = context_to_airflow_vars(context)
        if self.hive_conf:
            hive_conf.update(self.hive_conf)
        if self.bulk_load:
            tmp_file = NamedTemporaryFile()
            hive.to_csv(self.sql,
                        tmp_file.name,
                        delimiter='\t',
                        lineterminator='\n',
                        output_header=False,
                        hive_conf=hive_conf)
        else:
            hive_results = hive.get_records(self.sql, hive_conf=hive_conf)

        mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)

        if self.mysql_preoperator:
            self.log.info("Running MySQL preoperator")
            mysql.run(self.mysql_preoperator)

        self.log.info("Inserting rows into MySQL")
        if self.bulk_load:
            mysql.bulk_load(table=self.mysql_table, tmp_file=tmp_file.name)
            tmp_file.close()
        else:
            mysql.insert_rows(table=self.mysql_table, rows=hive_results)

        if self.mysql_postoperator:
            self.log.info("Running MySQL postoperator")
            mysql.run(self.mysql_postoperator)

        self.log.info("Done.")
Example #19
    def execute(self, context):
        """
        Execute the R command or script in a temporary directory
        """

        # Export additional environment variables
        os.environ.update(self.env)

        # Export context as environment variables
        airflow_context_vars = context_to_airflow_vars(context,
                                                       in_env_var_format=True)
        self.log.info(
            'Exporting the following env vars:\n%s', '\n'.join([
                "{}={}".format(k, v) for k, v in airflow_context_vars.items()
            ]))
        os.environ.update(airflow_context_vars)

        with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
            with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as f:

                f.write(bytes(self.r_command, 'utf_8'))
                f.flush()
                fname = f.name
                script_location = os.path.abspath(fname)

                self.log.info("Temporary script location: %s", script_location)
                self.log.info("Running command(s):\n%s", self.r_command)

                try:
                    res = robjects.r.source(fname, echo=False)
                except RRuntimeError as e:
                    self.log.error("Received R error: %s", e)
                    res = None

                # This will be a pickled rpy2.robjects.vectors.ListVector
                return res
Example #20
    def test_context_to_airflow_vars_empty_context(self):
        self.assertDictEqual(operator_helpers.context_to_airflow_vars({}), {})
Example #21
    def test_context_to_airflow_vars_empty_context(self):
        assert operator_helpers.context_to_airflow_vars({}) == {}
Example #22
    def execute(self, context):
        """
        Execute the bash command in a temporary directory
        which will be cleaned afterwards
        """
        self.log.info('Tmp dir root location: \n %s', gettempdir())

        # Prepare env for child process.
        env = self.env
        if env is None:
            env = os.environ.copy()

        airflow_context_vars = context_to_airflow_vars(context,
                                                       in_env_var_format=True)
        self.log.info(
            'Exporting the following env vars:\n%s', '\n'.join([
                "{}={}".format(k, v) for k, v in airflow_context_vars.items()
            ]))
        env.update(airflow_context_vars)

        self.lineage_data = self.bash_command

        with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
            with NamedTemporaryFile(dir=tmp_dir,
                                    prefix=self.task_id) as tmp_file:
                tmp_file.write(bytes(self.bash_command, 'utf_8'))
                tmp_file.flush()
                script_location = os.path.abspath(tmp_file.name)
                self.log.info('Temporary script location: %s', script_location)

                def pre_exec():
                    # Restore default signal disposition and invoke setsid
                    for sig in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
                        if hasattr(signal, sig):
                            signal.signal(getattr(signal, sig), signal.SIG_DFL)
                    os.setsid()

                self.log.info('Running command: %s', self.bash_command)
                sub_process = Popen(['bash', tmp_file.name],
                                    stdout=PIPE,
                                    stderr=STDOUT,
                                    cwd=tmp_dir,
                                    env=env,
                                    preexec_fn=pre_exec)

                self.sub_process = sub_process

                self.log.info('Output:')
                line = ''
                for raw_line in iter(sub_process.stdout.readline, b''):
                    line = raw_line.decode(self.output_encoding).rstrip()
                    self.log.info(line)

                sub_process.wait()

                self.log.info('Command exited with return code %s',
                              sub_process.returncode)

                if sub_process.returncode:
                    raise AirflowException('Bash command failed')

        return line
Example #23
    def execute(self, context):
        _log.info('Executing: %s', self.hql)
        self.hook = self.get_hook()
        self.hook.run_cli(hql=self.hql, schema=self.schema,
                          hive_conf=context_to_airflow_vars(context))
Example #24
    def test_context_to_airflow_vars_empty_context(self):
        self.assertDictEqual(operator_helpers.context_to_airflow_vars({}), {})
Example #25
    def execute(self, context):
        """
        Execute the bash command in a temporary directory
        which will be cleaned afterwards
        """
        self.log.info("Tmp dir root location: \n %s", gettempdir())

        # Prepare env for child process.
        env = self.env
        if env is None:
            env = os.environ.copy()
        airflow_context_vars = context_to_airflow_vars(context,
                                                       in_env_var_format=True)
        self.log.debug(
            "Exporting the following env vars:\n%s",
            "\n".join([
                "{}={}".format(k, v) for k, v in airflow_context_vars.items()
            ]),
        )
        env.update(airflow_context_vars)

        self.lineage_data = self.bash_command

        with TemporaryDirectory(prefix="airflowtmp") as tmp_dir:
            with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as f:

                f.write(bytes(self.bash_command, "utf_8"))
                f.flush()
                fname = f.name
                script_location = os.path.abspath(fname)
                self.log.info("Temporary script location: %s", script_location)

                def pre_exec():
                    # Restore default signal disposition and invoke setsid
                    for sig in ("SIGPIPE", "SIGXFZ", "SIGXFSZ"):
                        if hasattr(signal, sig):
                            signal.signal(getattr(signal, sig), signal.SIG_DFL)
                    os.setsid()

                self.log.info("Running command: %s", self.bash_command)
                self.sub_process = Popen(["bash", fname],
                                         stdout=PIPE,
                                         stderr=STDOUT,
                                         cwd=tmp_dir,
                                         env=env,
                                         preexec_fn=pre_exec)

                self.log.info("Output:")
                line = ""
                for line in iter(self.sub_process.stdout.readline, b""):
                    line = line.decode(self.output_encoding).rstrip()
                    self.log.info(line)
                self.sub_process.wait()
                self.log.info("Command exited with return code %s",
                              self.sub_process.returncode)

                if self.sub_process.returncode:
                    raise AirflowException("Bash command failed")

        if self.xcom_push_flag:
            return line
Example #26
    def execute(self, context):
        """
        Execute the bash command in a temporary directory
        which will be cleaned afterwards
        """
        try:
            self.log.info("Tmp dir root location: \n %s", gettempdir())

            # Prepare env for child process.
            if self.env is None:
                self.env = os.environ.copy()
            airflow_context_vars = context_to_airflow_vars(
                context, in_env_var_format=True)
            self.log.info("Exporting the following env vars:\n" + '\n'.join([
                "{}={}".format(k, v) for k, v in airflow_context_vars.items()
            ]))
            self.env.update(airflow_context_vars)

            self.lineage_data = self.bash_command

            with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
                with NamedTemporaryFile(dir=tmp_dir, prefix=self.task_id) as f:

                    f.write(bytes(self.bash_command, 'utf_8'))
                    f.flush()
                    fname = f.name
                    script_location = os.path.abspath(fname)
                    self.log.info("Temporary script location: %s",
                                  script_location)

                    def pre_exec():
                        # Restore default signal disposition and invoke setsid
                        for sig in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'):
                            if hasattr(signal, sig):
                                signal.signal(getattr(signal, sig),
                                              signal.SIG_DFL)
                        os.setsid()

                    self.log.info("Running command: %s", self.bash_command)
                    sp = Popen(['bash', fname],
                               stdout=PIPE,
                               stderr=STDOUT,
                               cwd=tmp_dir,
                               env=self.env,
                               preexec_fn=pre_exec)
                    self.sp = sp
                    self.log.info("Output:")
                    line = ''
                    for line in iter(sp.stdout.readline, b''):
                        line = line.decode(self.output_encoding).rstrip()
                        self.log.info(line)
                    sp.wait()
                    self.log.info("Command exited with return code %s",
                                  sp.returncode)

                    if sp.returncode:
                        raise AirflowException("Bash command failed")

            if self.xcom_push_flag:
                return line
        except Exception as e:
            self.log.info("exec threw an exception: %s", e)
            with open(path_flag + self.flag + ".txt",
                      mode='a',
                      encoding='utf-8') as f:
                f.write(self.flag + " " + str(self.task_name) +
                        " run failed !!!!!\n")
                f.write("error is " + str(e) + "\n")
            # Re-raise so the task is still marked as failed;
            # a bare raise preserves the original traceback.
            raise
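
Across the execute() variants above, the recurring shape is the same: copy the parent environment, merge in any user-supplied variables, layer the AIRFLOW_CTX_* variables from context_to_airflow_vars on top, and hand the result to the child process. A condensed sketch of just that shape, stripped of operator plumbing (run_shell_snippet is an illustrative name, not taken from any example above):

import os
from subprocess import PIPE, STDOUT, Popen
from tempfile import NamedTemporaryFile, TemporaryDirectory

from airflow.utils.operator_helpers import context_to_airflow_vars


def run_shell_snippet(bash_command, context, extra_env=None):
    # Parent env first so PATH and friends survive, then user vars,
    # then the AIRFLOW_CTX_* vars, which win on key collisions.
    env = os.environ.copy()
    env.update(extra_env or {})
    env.update(context_to_airflow_vars(context, in_env_var_format=True))

    with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
        with NamedTemporaryFile(dir=tmp_dir, suffix='.sh') as script:
            script.write(bash_command.encode('utf-8'))
            script.flush()
            proc = Popen(['bash', script.name],
                         stdout=PIPE, stderr=STDOUT,
                         cwd=tmp_dir, env=env)
            output = proc.communicate()[0].decode().rstrip()
    if proc.returncode:
        raise RuntimeError('Bash command failed')
    # The operators above XCom-push the last line of output.
    return output.splitlines()[-1] if output else ''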