Example #1
    def run_spark_job(self):
        command = None
        program = None
        try:
            template = Template(self.program)
            dwd = DataWarehouseDates()
            program = template.render(
                DS=self.ds,
                DS_TODAY=dwd.ds_today(self.ds),
                DS_DATE_ID=dwd.date_id_from_date_str(self.ds),
                DS_DATE_ADD=lambda days: dwd.date_add(self.ds, days),
                DS_TODAY_DATE_ADD=lambda days: dwd.date_add(
                    dwd.ds_today(self.ds), days))

            qubole_name = '%s_%s_%s' % (self.dag_id, self.task_id, self.ds)

            if 'SQL' in self.language.upper():
                program_with_tracking = '-- %s %s %s\n%s' % (
                    self.dag_id, self.task_id, self.ds, program)
                command = SparkCommand.create(sql=program_with_tracking,
                                              arguments=self.arguments,
                                              label=self.label,
                                              name=qubole_name)
            elif ('PYTHON' in self.language.upper()
                  or 'R' in self.language.upper()):
                program_with_tracking = '# %s %s %s\n%s' % (
                    self.dag_id, self.task_id, self.ds, program)
                command = SparkCommand.create(program=program_with_tracking,
                                              language=self.language,
                                              arguments=self.arguments,
                                              label=self.label,
                                              name=qubole_name)
            elif 'SCALA' in self.language.upper():
                program_with_tracking = '// %s %s %s\n%s' % (
                    self.dag_id, self.task_id, self.ds, program)
                command = SparkCommand.create(program=program_with_tracking,
                                              language=self.language,
                                              arguments=self.arguments,
                                              label=self.label,
                                              name=qubole_name)
            else:
                raise AirflowException('Invalid Spark language specified.')

            self.monitor_command(command, program_with_tracking)
        except Exception as e:
            if command is None:
                raise AirflowException(
                    'run_spark_job call for %s failed. No command id available.\n%s'
                    % (self.program, e))
            else:
                raise AirflowException(
                    'run_spark_job call for %s failed. https://api.qubole.com/v2/analyze?command_id=%s\n%s'
                    % (program, command.id, e))
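
Example #1 hands the blocking part to self.monitor_command, which is not shown. A minimal sketch of what such a polling helper might look like, assuming import time and from qds_sdk.commands import Command at module level and using the stock Command.find / Command.is_done / Command.is_success helpers (the method body and failure handling here are assumptions, not the original implementation):

    def monitor_command(self, command, program):
        # Poll Qubole until the command reaches a terminal state
        # (done / error / cancelled), re-fetching it on each pass.
        while not Command.is_done(command.status):
            time.sleep(30)
            command = Command.find(command.id)
        if not Command.is_success(command.status):
            # Surface the failing program alongside the command id.
            raise AirflowException(
                'Command %s finished with status %s.\n%s'
                % (command.id, command.status, program))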
Example #2
    def run_spark_job(self, query, cluster, verbose=False, **kwargs):
        print('Running Spark job')
        cmd = SparkCommand.create(query=query, label=cluster, **kwargs)
        # Poll until Qubole reports the command as done; the helpers below
        # are expected to re-fetch the command (and pause between polls).
        while cmd.attributes.get('status', None) != 'done':
            if verbose:
                cmd = self._get_logs(cmd)
            else:
                cmd = self._get_status(cmd)
        return cmd
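
All of these snippets assume the qds_sdk client has already been configured with an account API token. A minimal stand-alone setup sketch (the token, cluster label, and query are placeholders, not values from the examples):

from qds_sdk.qubole import Qubole
from qds_sdk.commands import SparkCommand

# Configure the SDK once per process before creating any commands.
Qubole.configure(api_token='YOUR_API_TOKEN')

# Roughly equivalent to the create() call inside run_spark_job above.
cmd = SparkCommand.create(sql='SELECT count(*) FROM events',
                          label='spark-cluster',
                          name='adhoc_spark_example')
print(cmd.id, cmd.status)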
Example #3
    def run_spark(self, main_class):
        spark_cmd_line = CmdLineBuilder()
        spark_cmd_line.add("/usr/lib/spark/bin/spark-submit", "--class", main_class)
        spark_cmd_line.extend(self.config_to_command_line())

        # application jar
        spark_cmd_line.add(self.deploy.sync(self.config.main_jar))
        # add user side args
        spark_cmd_line.extend(list_of_strings(self.task.application_args()))

        cmd = SparkCommand.create(
            cmdline=spark_cmd_line.get_cmd_line(safe_curly_brackets=True),
            language="command_line",
            label=self.qubole_config.cluster_label,
            name=self.task.task_id,
        )
        self._handle_qubole_operator_execution(cmd)
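
For reference, what Example #3 assembles via CmdLineBuilder boils down to a plain spark-submit line handed to Qubole's "command_line" language. A hand-written equivalent, where the class name, jar path, arguments, and cluster label are placeholders:

from qds_sdk.commands import SparkCommand

cmd = SparkCommand.create(
    cmdline=('/usr/lib/spark/bin/spark-submit '
             '--class com.example.Main '
             's3://bucket/jars/app.jar --date 2024-01-01'),
    language='command_line',
    label='spark-cluster',
    name='example_task',
)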
Example #4
    def run_pyspark(self, pyspark_script):
        # should be reimplemented using SparkSubmitHook (maybe from airflow)
        # note that config jars are not supported.

        arguments = list2cmdline_safe(list_of_strings(
            self.task.application_args()),
                                      safe_curly_brackets=True)

        cmd = SparkCommand.create(
            script_location=self.deploy.sync(pyspark_script),
            language="python",
            user_program_arguments=arguments,
            arguments=list2cmdline_safe(self.config_to_command_line(),
                                        safe_curly_brackets=True),
            label=self.qubole_config.cluster_label,
            name=self.task.task_id,
        )
        self._handle_qubole_operator_execution(cmd)

        return True
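
Once a command has finished, its results and logs can be pulled back through the same SDK objects. A small sketch using the stock qds_sdk Command methods (the command id below is a placeholder):

import sys
from qds_sdk.commands import Command

cmd = Command.find(12345)           # placeholder command id
if Command.is_success(cmd.status):
    cmd.get_results(fp=sys.stdout)  # stream result rows to stdout
else:
    print(cmd.get_log())            # driver/executor logs for debugging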