Example #1
0
    def execute(self, context):
        hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id)
        logging.info("Extracting data from Hive")
        logging.info(self.sql)

        if self.bulk_load:
            tmpfile = NamedTemporaryFile()
            hive.to_csv(self.sql,
                        tmpfile.name,
                        delimiter='\t',
                        lineterminator='\n',
                        output_header=False)
        else:
            results = hive.get_records(self.sql)

        mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
        if self.mysql_preoperator:
            logging.info("Running MySQL preoperator")
            mysql.run(self.mysql_preoperator)

        logging.info("Inserting rows into MySQL")

        if self.bulk_load:
            mysql.bulk_load(table=self.mysql_table, tmp_file=tmpfile.name)
            tmpfile.close()
        else:
            mysql.insert_rows(table=self.mysql_table, rows=results)

        if self.mysql_postoperator:
            logging.info("Running MySQL postoperator")
            mysql.run(self.mysql_postoperator)

        logging.info("Done.")
Example #2
0
    def execute(self, context):
        hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id)
        logging.info("Extracting data from Hive")
        logging.info(self.sql)

        if self.bulk_load:
            tmpfile = NamedTemporaryFile()
            hive.to_csv(self.sql, tmpfile.name, delimiter='\t',
                lineterminator='\n', output_header=False)
        else:
            results = hive.get_records(self.sql)

        mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
        if self.mysql_preoperator:
            logging.info("Running MySQL preoperator")
            mysql.run(self.mysql_preoperator)

        logging.info("Inserting rows into MySQL")

        if self.bulk_load:
            mysql.bulk_load(table=self.mysql_table, tmp_file=tmpfile.name)
            tmpfile.close()
        else:
            mysql.insert_rows(table=self.mysql_table, rows=results)

        if self.mysql_postoperator:
            logging.info("Running MySQL postoperator")
            mysql.run(self.mysql_postoperator)

        logging.info("Done.")
Example #3
0
 def execute(self, context):
     logging.info('Executing: ' + str(self.sql))
     hook = MySqlHook(mysql_conn_id=self.mysql_conn_id)
     hook.run(
         self.sql,
         autocommit=self.autocommit,
         parameters=self.parameters)
Example #4
0
def insert_or_delete_task_state(*args, **kargs):
    ti = kargs["ti"]  ##current task instance
    insert_sql = kargs["insert_sql"]  # insert upstream state sql
    delete_sql = kargs["delete_sql"]  # delete upstream state sql
    upstream_task_id = kargs["up_id"]  # upstream task id
    xcom_pull_key = kargs["xcom_key"]  # key for xcom_pull
    mysql_conn_id = kargs["mysql_conn_id"]  # conn id for database which stores state table

    actual_sql = ""
    ##method 1 of get the upstream task's state
    # up_state = ti.xcom_pull(key=xcom_pull_key, task_ids = upstream_task_id)

    upstream_task_list = kargs["task"].upstream_list
    logging.info("%s upstream task list is %s" % (kargs["task"], upstream_task_list))
    upstream_task_state = (
        upstream_task_list[0]
        .get_task_instances(
            session=settings.Session(), start_date=kargs["execution_date"], end_date=kargs["execution_date"]
        )[0]
        .state
    )
    logging.info("%s upstream task(%s) state is %s" % (kargs["task"], upstream_task_list[0], upstream_task_state))
    if upstream_task_state == "success":
        actual_sql = insert_sql
    elif upstream_task_state == "failed":
        actual_sql = delete_sql
    else:
        actual_sql = "show databases;"  ## if not success or failed, do something effectiveless
    logging.info("upstream state is %s, actual update status sql is %s" % (str(upstream_task_state), str(actual_sql)))
    ##start to execute sql
    logging.info("Executing: %s" % (str(actual_sql)))
    hook = MySqlHook(mysql_conn_id=mysql_conn_id)
    hook.run(actual_sql)
Example #5
0
    def execute(self, context):
        hive = HiveServer2Hook(hiveserver2_conn_id=self.hive_cli_conn_id)
        logging.info("Extracting data from Hive")
        results = hive.get_records(self.sql)

        mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
        if self.mysql_preoperator:
            logging.info("Running MySQL preoperator")
            mysql.run(self.mysql_preoperator)
        logging.info("Inserting rows into MySQL")
        mysql.insert_rows(table=self.mysql_table, rows=results)
Example #6
0
    def execute(self, context):
        hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id)
        logging.info("Extracting data from Hive")
        results = hive.get_records(self.sql)

        mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
        if self.mysql_preoperator:
            logging.info("Running MySQL preoperator")
            mysql.run(self.mysql_preoperator)
        logging.info("Inserting rows into MySQL")
        mysql.insert_rows(table=self.mysql_table, rows=results)
Example #7
0
    def execute(self, context):
        presto = PrestoHook(presto_conn_id=self.presto_conn_id)
        logging.info("Extracting data from Presto")
        logging.info(self.sql)
        results = hive.get_records(self.sql)

        mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
        if self.mysql_preoperator:
            logging.info("Running MySQL preoperator")
            logging.info(self.mysql_preoperator)
            mysql.run(self.mysql_preoperator)

        logging.info("Inserting rows into MySQL")
        mysql.insert_rows(table=self.mysql_table, rows=results)
Example #8
0
    def execute(self, context):
        presto = PrestoHook(presto_conn_id=self.presto_conn_id)
        logging.info("Extracting data from Presto")
        logging.info(self.sql)
        results = presto.get_records(self.sql)

        mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
        if self.mysql_preoperator:
            logging.info("Running MySQL preoperator")
            logging.info(self.mysql_preoperator)
            mysql.run(self.mysql_preoperator)

        logging.info("Inserting rows into MySQL")
        mysql.insert_rows(table=self.mysql_table, rows=results)
Example #9
0
    def post_execute(self, context):
        try:
            log_filepath = context["task_instance"].log_filepath.strip('.log')

            # parse the log file into a dict
            parsed = HiveLog.parselog(log_filepath)

            # captures the name of a parent dag in a subdag routine
            dag_ids = context['dag'].dag_id.split('.')
            if len(dag_ids) > 1:
                parsed['parent_dag_id'] = dag_ids[0]
                parsed['dag_id'] = dag_ids[1:].join('.')
            else:
                parsed['parent_dag_id'] = context['dag'].dag_id
                parsed['dag_id'] = parsed['parent_dag_id']

            # Capture extra information from context
            parsed['task_id'] = context['task'].task_id
            parsed['input_date'] = context['ds']
            parsed['owner'] = context['task'].owner
            # Create the table if it doesnt exist
            sql_hook = MySqlHook("airflow_db")
            sql_hook.run(table_creation_string())

            # Create sql insertion strings and insert
            strings = table_insertion_strings(parsed)
            logging.info('Inserting honypot log metadata into database')
            for add_data_to_table_string in strings:
                sql_hook.run(add_data_to_table_string)
            logging.info('Completely log data insertion')
            sql_hook.run('COMMIT;')

        except Exception as e:
            logging.error("Honeypot post executor failed")
            logging.exception(e)
Example #10
0
class MySqlOperator(BaseOperator):
    """
    Executes sql code in a specific mysql database.
    """

    __mapper_args__ = {'polymorphic_identity': 'MySqlOperator'}
    template_fields = ('sql', )
    template_ext = ('.sql', )

    @apply_defaults
    def __init__(self, sql, mysql_conn_id, *args, **kwargs):
        """
        Parameters:
        mysql_conn_id: reference to a specific mysql database
        sql: the sql code you to be executed
        """
        super(MySqlOperator, self).__init__(*args, **kwargs)

        self.hook = MySqlHook(mysql_conn_id=mysql_conn_id)
        self.sql = sql

    def execute(self, execution_date):
        logging.info('Executing: ' + self.sql)
        self.hook.run(self.sql)
Example #11
0
    def execute(self, context=None):
        metastore = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
        table = metastore.get_table(table_name=self.table)
        field_types = {col.name: col.type for col in table.sd.cols}

        exprs = {
            ('', 'count'): 'COUNT(*)'
        }
        for col, col_type in field_types.items():
            d = {}
            if self.assignment_func:
                d = self.assignment_func(col, col_type)
                if d is None:
                    d = self.get_default_exprs(col, col_type)
            else:
                d = self.get_default_exprs(col, col_type)
            exprs.update(d)
        exprs.update(self.extra_exprs)
        exprs = OrderedDict(exprs)
        exprs_str = ",\n        ".join([
            v + " AS " + k[0] + '__' + k[1]
            for k, v in exprs.items()])

        where_clause = [
            "{0} = '{1}'".format(k, v) for k, v in self.partition.items()]
        where_clause = " AND\n        ".join(where_clause)
        sql = """
        SELECT
            {exprs_str}
        FROM {self.table}
        WHERE
            {where_clause};
        """.format(**locals())

        hook = PrestoHook(presto_conn_id=self.presto_conn_id)
        logging.info('Executing SQL check: ' + sql)
        row = hook.get_first(hql=sql)
        logging.info("Record: " + str(row))
        if not row:
            raise Exception("The query returned None")

        part_json = json.dumps(self.partition, sort_keys=True)

        logging.info("Deleting rows from previous runs if they exist")
        mysql = MySqlHook(self.mysql_conn_id)
        sql = """
        SELECT 1 FROM hive_stats
        WHERE
            table_name='{self.table}' AND
            partition_repr='{part_json}' AND
            dttm='{self.dttm}'
        LIMIT 1;
        """.format(**locals())
        if mysql.get_records(sql):
            sql = """
            DELETE FROM hive_stats
            WHERE
                table_name='{self.table}' AND
                partition_repr='{part_json}' AND
                dttm='{self.dttm}';
            """.format(**locals())
            mysql.run(sql)

        logging.info("Pivoting and loading cells into the Airflow db")
        rows = [
            (self.ds, self.dttm, self.table, part_json) +
            (r[0][0], r[0][1], r[1])
            for r in zip(exprs, row)]
        mysql.insert_rows(
            table='hive_stats',
            rows=rows,
            target_fields=[
                'ds',
                'dttm',
                'table_name',
                'partition_repr',
                'col',
                'metric',
                'value',
            ]
        )
Example #12
0
    def execute(self, context=None):
        metastore = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
        table = metastore.get_table(table_name=self.table)
        field_types = {col.name: col.type for col in table.sd.cols}

        exprs = {('', 'count'): 'COUNT(*)'}
        for col, col_type in field_types.items():
            d = {}
            if self.assignment_func:
                d = self.assignment_func(col, col_type)
                if d is None:
                    d = self.get_default_exprs(col, col_type)
            else:
                d = self.get_default_exprs(col, col_type)
            exprs.update(d)
        exprs.update(self.extra_exprs)
        exprs = OrderedDict(exprs)
        exprs_str = ",\n        ".join(
            [v + " AS " + k[0] + '__' + k[1] for k, v in exprs.items()])

        where_clause = [
            "{0} = '{1}'".format(k, v) for k, v in self.partition.items()
        ]
        where_clause = " AND\n        ".join(where_clause)
        sql = """
        SELECT
            {exprs_str}
        FROM {self.table}
        WHERE
            {where_clause};
        """.format(**locals())

        hook = PrestoHook(presto_conn_id=self.presto_conn_id)
        logging.info('Executing SQL check: ' + sql)
        row = hook.get_first(hql=sql)
        logging.info("Record: " + str(row))
        if not row:
            raise AirflowException("The query returned None")

        part_json = json.dumps(self.partition, sort_keys=True)

        logging.info("Deleting rows from previous runs if they exist")
        mysql = MySqlHook(self.mysql_conn_id)
        sql = """
        SELECT 1 FROM hive_stats
        WHERE
            table_name='{self.table}' AND
            partition_repr='{part_json}' AND
            dttm='{self.dttm}'
        LIMIT 1;
        """.format(**locals())
        if mysql.get_records(sql):
            sql = """
            DELETE FROM hive_stats
            WHERE
                table_name='{self.table}' AND
                partition_repr='{part_json}' AND
                dttm='{self.dttm}';
            """.format(**locals())
            mysql.run(sql)

        logging.info("Pivoting and loading cells into the Airflow db")
        rows = [(self.ds, self.dttm, self.table, part_json) +
                (r[0][0], r[0][1], r[1]) for r in zip(exprs, row)]
        mysql.insert_rows(table='hive_stats',
                          rows=rows,
                          target_fields=[
                              'ds',
                              'dttm',
                              'table_name',
                              'partition_repr',
                              'col',
                              'metric',
                              'value',
                          ])
Example #13
0
 def execute(self, context):
     logging.info("Executing: " + self.sql)
     hook = MySqlHook(mysql_conn_id=self.mysql_conn_id)
     hook.run(self.sql)
Example #14
0
 def execute(self, context):
     logging.info('Executing: ' + str(self.sql))
     hook = MySqlHook(mysql_conn_id=self.mysql_conn_id)
     hook.run(self.sql,
              autocommit=self.autocommit,
              parameters=self.parameters)
Example #15
0
 def execute(self, context):
     logging.info('Executing: ' + self.sql)
     hook = MySqlHook(mysql_conn_id=self.mysql_conn_id)
     hook.run(self.sql)