Example #1
    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        vertica = VerticaHook(vertica_conn_id=self.vertica_conn_id)

        self.log.info("Dumping Vertica query results to local file")
        conn = vertica.get_conn()
        cursor = conn.cursor()
        cursor.execute(self.sql)
        with NamedTemporaryFile("w") as f:
            # 'csv' must be unicodecsv (import unicodecsv as csv) for this
            # call: the stdlib csv.writer accepts no 'encoding' argument.
            csv_writer = csv.writer(f,
                                    delimiter=self.delimiter,
                                    encoding='utf-8')
            field_dict = OrderedDict()
            col_count = 0
            for field in cursor.description:
                col_count += 1
                col_position = "Column{position}".format(position=col_count)
                field_dict[col_position if field[0] == '' else field[0]] = \
                    self.type_map(field[1])
            csv_writer.writerows(cursor.iterate())
            f.flush()
            cursor.close()
            conn.close()
            self.log.info("Loading file into Hive")
            hive.load_file(f.name,
                           self.hive_table,
                           field_dict=field_dict,
                           create=self.create,
                           partition=self.partition,
                           delimiter=self.delimiter,
                           recreate=self.recreate)
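
The type_map() call above translates vertica-python type codes into Hive
column types. A minimal sketch of such a mapping (the specific code-to-type
pairs are assumptions, not the exact table Airflow ships; unknown codes fall
back to STRING):

    @classmethod
    def type_map(cls, vertica_type):
        # Hypothetical code-to-type table; adjust to the driver version used.
        _map = {
            0: 'BOOLEAN',
            1: 'INT',
            2: 'FLOAT',
            3: 'CHAR',
            5: 'VARCHAR',
        }
        return _map.get(vertica_type, 'STRING')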
Example #2
def get_compliance_status(**kwargs):
    cur = VerticaHook('vertica').get_cursor()
    sql = "SELECT GET_COMPLIANCE_STATUS()"
    cur.execute(sql)
    result = cur.fetchall()
    print(result[0][0])
    return result[0][0]
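
A function like this is usually wired into a DAG as a PythonOperator; a
minimal sketch, where the task id and the dag object are placeholders:

from airflow.operators.python_operator import PythonOperator

compliance_check = PythonOperator(
    task_id='get_compliance_status',
    python_callable=get_compliance_status,
    provide_context=True,  # passes **kwargs on pre-2.0 Airflow
    dag=dag,  # assumes a DAG defined elsewhere
)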
Example #3
 def __init__(self, vertica_conn_id='vertica_default',
              checker=lambda x: True,
              sql='select 1',
              *args, **kwargs):
     super().__init__(*args, **kwargs)
     self.hook = VerticaHook(vertica_conn_id=vertica_conn_id)
     self.checker = checker
     self.sql = sql
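
This __init__ looks like a sensor's; a hedged sketch of a matching poke()
method (the body is an assumption, not part of the source):

 def poke(self, context):
     # Run the SQL and let the user-supplied checker decide success.
     return self.checker(self.hook.get_records(self.sql))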
Example #4
def workflow(src_conn_id, src_schema, dt, target_conn_id, target_table):
    # TODO Split code into functions
    etl_conn = PostgresHook(postgres_conn_id='etl').get_conn()

    with Session(etl_conn, f'{dag.dag_id}__{src_schema}') as session:
        # Load data
        source_conn = MsSqlHook(mssql_conn_id=src_conn_id,
                                schema=src_schema).get_conn()

        query = f"""
            SELECT 
                id, start_time, end_time, type, data
            FROM dbo.Orders
            WHERE
                CONVERT(DATE, start_time) = '{dt}'
            """

        df = pd.read_sql_query(query, source_conn)

        # Skip if no rows
        if df.empty:
            raise AirflowSkipException('No rows to load')

        # Add service fields
        df['etl_source'] = src_schema
        df['etl_id'] = session.id
        df['hash_id'] = hash_pandas_object(df[['etl_source', 'id']])

        # Export data to CSV buffer
        buffer = StringIO()
        df.to_csv(buffer,
                  index=False,
                  sep='|',
                  na_rep='NUL',
                  quoting=csv.QUOTE_MINIMAL,
                  header=False,
                  float_format='%.8f',
                  doublequote=False,
                  escapechar='\\')
        buffer.seek(0)

        # Push CSV
        target_conn = VerticaHook(vertica_conn_id=target_conn_id).get_conn()

        # Join the column names explicitly; interpolating df.columns.to_list()
        # would render a Python list repr, which is not valid SQL.
        copy_stmt = f"""
            COPY {target_table} ({', '.join(df.columns)})
            FROM STDIN
            DELIMITER '|'
            ENCLOSED '"'
            ABORT ON ERROR
            NULL 'NUL'
            """

        cursor = target_conn.cursor()
        cursor.copy(copy_stmt, buffer)

        session.loaded_rows = cursor.rowcount
        session.successful = True
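
The extract query above interpolates dt straight into the SQL string. A
parameterized variant (a sketch; assumes the MSSQL driver speaks the
pyformat paramstyle, as pymssql does):

query = """
    SELECT id, start_time, end_time, type, data
    FROM dbo.Orders
    WHERE CONVERT(DATE, start_time) = %(dt)s
    """
df = pd.read_sql_query(query, source_conn, params={'dt': dt})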
Example #5
 def get_hook(self):
     try:
         if self.conn_type == 'mysql':
             from airflow.hooks.mysql_hook import MySqlHook
             return MySqlHook(mysql_conn_id=self.conn_id)
         elif self.conn_type == 'google_cloud_platform':
             from airflow.contrib.hooks.bigquery_hook import BigQueryHook
             return BigQueryHook(bigquery_conn_id=self.conn_id)
         elif self.conn_type == 'postgres':
             from airflow.hooks.postgres_hook import PostgresHook
             return PostgresHook(postgres_conn_id=self.conn_id)
         elif self.conn_type == 'hive_cli':
             from airflow.hooks.hive_hooks import HiveCliHook
             return HiveCliHook(hive_cli_conn_id=self.conn_id)
         elif self.conn_type == 'presto':
             from airflow.hooks.presto_hook import PrestoHook
             return PrestoHook(presto_conn_id=self.conn_id)
         elif self.conn_type == 'hiveserver2':
             from airflow.hooks.hive_hooks import HiveServer2Hook
             return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
         elif self.conn_type == 'sqlite':
             from airflow.hooks.sqlite_hook import SqliteHook
             return SqliteHook(sqlite_conn_id=self.conn_id)
         elif self.conn_type == 'jdbc':
             from airflow.hooks.jdbc_hook import JdbcHook
             return JdbcHook(jdbc_conn_id=self.conn_id)
         elif self.conn_type == 'mssql':
             from airflow.hooks.mssql_hook import MsSqlHook
             return MsSqlHook(mssql_conn_id=self.conn_id)
         elif self.conn_type == 'oracle':
             from airflow.hooks.oracle_hook import OracleHook
             return OracleHook(oracle_conn_id=self.conn_id)
         elif self.conn_type == 'vertica':
             from airflow.contrib.hooks.vertica_hook import VerticaHook
             return VerticaHook(vertica_conn_id=self.conn_id)
         elif self.conn_type == 'cloudant':
             from airflow.contrib.hooks.cloudant_hook import CloudantHook
             return CloudantHook(cloudant_conn_id=self.conn_id)
         elif self.conn_type == 'jira':
             from airflow.contrib.hooks.jira_hook import JiraHook
             return JiraHook(jira_conn_id=self.conn_id)
         elif self.conn_type == 'redis':
             from airflow.contrib.hooks.redis_hook import RedisHook
             return RedisHook(redis_conn_id=self.conn_id)
         elif self.conn_type == 'wasb':
             from airflow.contrib.hooks.wasb_hook import WasbHook
             return WasbHook(wasb_conn_id=self.conn_id)
         elif self.conn_type == 'docker':
             from airflow.hooks.docker_hook import DockerHook
             return DockerHook(docker_conn_id=self.conn_id)
     except Exception:
         # An unrecognised conn_type (or a failed import) silently returns
         # None; a bare 'except:' would also swallow KeyboardInterrupt.
         pass
Example #6
 def execute(self, context=None):
     self.log.info('Executing SQL: {}'.format(self.sql))
     hook = VerticaHook(vertica_conn_id=self.vertica_conn_id)
     # Expects SQL that returns one row when two counts agree (e.g. a UNION
     # of two COUNT(*) queries) and two rows when they differ.
     records = len(hook.get_records(self.sql))
     self.log.info('Returned rows: {}'.format(records))
     if records == 2 and not self.reverse:
         raise AirflowException('Row counts are not equal')
     elif records == 1 and self.reverse:
         raise AirflowException('Row counts are equal')
     elif records not in (1, 2):
         raise AirflowException(
             'Expected 1 or 2 rows, got {}'.format(records))
     self.log.info('Success!')
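
A sketch of the kind of statement this check expects (the table names are
placeholders): UNION deduplicates, so equal counts collapse to one row and
unequal counts produce two.

sql = """
    SELECT COUNT(*) FROM schema_a.events
    UNION
    SELECT COUNT(*) FROM schema_b.events
    """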
Example #7
 def execute(self, context=None):
     hook = VerticaHook(vertica_conn_id=self.vertica_conn_id)
     self.log.info('Executing SQL: {}'.format(self.sql))
     with closing(hook.get_conn()) as conn:
         with closing(conn.cursor()) as cur:
             res = cur.execute(self.sql)
             count = 0
             for _ in res.iterate():
                 count += 1
                 if not self.reverse and count > 0:
                     raise AirflowException('Result not empty')
             if self.reverse and count == 0:
                 raise AirflowException('Result is empty')
             self.log.info('Success!')
Example #8
    def execute(self, context):
        self.log.info('Running SQL query {}'.format(self.sql))
        hook = VerticaHook(vertica_conn_id=self.vertica_conn_id)
        result_set = hook.get_records(self.sql)
        condition = self.test(result_set, context=context)

        if condition:
            self.log.info('Proceeding with downstream tasks...')
            return

        self.log.info('Skipping downstream tasks...')

        downstream_tasks = context['task'].get_flat_relatives(upstream=False)
        self.log.debug("Downstream task_ids %s", downstream_tasks)

        if downstream_tasks:
            self.skip(context['dag_run'], context['ti'].execution_date,
                      downstream_tasks)

        self.log.info("Done.")
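
A hedged sketch of wiring this operator up; the class name and dag object
are assumptions, but the test callable must accept the result set plus a
context keyword, as the execute() above shows:

check_rows = VerticaSkipOperator(  # hypothetical class name
    task_id='check_rows',
    sql='SELECT COUNT(*) FROM staging.orders',
    test=lambda result_set, context: bool(result_set and result_set[0][0]),
    vertica_conn_id='vertica_default',
    dag=dag,  # assumes a DAG defined elsewhere
)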
Example #9
 def __init__(self,
              source,
              target,
              date_column=None,
              truncate_date=False,
              direct=False,
              column_mapping=None,
              force_introspection=False,
              exclude=None,
              sql=None,
              *args,
              **kwargs):
      # Default so that an explicit column_mapping without introspection
      # does not hit a NameError at columns.update() below.
      columns = {}
      if column_mapping is None or force_introspection:
         exclude = exclude or []
         vertica_conn_id = kwargs.get('vertica_conn_id', 'vertica_default')
         hook = VerticaHook(vertica_conn_id=vertica_conn_id)
         columns = {
             c: c
             for (c,
                  ) in hook.get_records(GET_TABLE_COLUMNS, dict(
                      table=target)) if c not in exclude
         }
         self.log.debug('Target columns discovered: {}'.format(
             list(columns)))
     column_mapping = column_mapping or {}
     columns.update(column_mapping)
     source_columns, target_columns = list(zip(*columns.items()))
     _sql = sql or INSERT
     params = dict(source=source,
                   target=target,
                   date_column=date_column,
                   truncate_date=truncate_date,
                   direct=direct,
                   source_columns=', '.join(source_columns),
                   target_columns=', '.join(target_columns))
     super().__init__(sql=_sql, params=params, *args, **kwargs)
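
GET_TABLE_COLUMNS is defined outside this snippet; a plausible
reconstruction querying Vertica's v_catalog (the exact SQL is an assumption,
written for vertica-python's named paramstyle):

GET_TABLE_COLUMNS = """
    SELECT column_name
    FROM v_catalog.columns
    WHERE table_name = :table
    ORDER BY ordinal_position
"""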
Example #10
    def execute(self, context):
        vertica = VerticaHook(vertica_conn_id=self.vertica_conn_id)
        mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)

        tmpfile = None
        result = None

        selected_columns = []

        count = 0
        with closing(vertica.get_conn()) as conn:
            with closing(conn.cursor()) as cursor:
                cursor.execute(self.sql)
                selected_columns = [d.name for d in cursor.description]

                if self.bulk_load:
                    tmpfile = NamedTemporaryFile("w")

                    self.log.info(
                        "Selecting rows from Vertica to local file %s...",
                        tmpfile.name)
                    self.log.info(self.sql)

                    # 'csv' is assumed to be unicodecsv here, as in Airflow's
                    # transfer operators; stdlib csv.writer takes no
                    # 'encoding' argument.
                    csv_writer = csv.writer(tmpfile,
                                            delimiter='\t',
                                            encoding='utf-8')
                    for row in cursor.iterate():
                        csv_writer.writerow(row)
                        count += 1

                    tmpfile.flush()
                else:
                    self.log.info("Selecting rows from Vertica...")
                    self.log.info(self.sql)

                    result = cursor.fetchall()
                    count = len(result)

                self.log.info("Selected rows from Vertica %s", count)

        if self.mysql_preoperator:
            self.log.info("Running MySQL preoperator...")
            mysql.run(self.mysql_preoperator)

        try:
            if self.bulk_load:
                self.log.info("Bulk inserting rows into MySQL...")
                with closing(mysql.get_conn()) as conn:
                    with closing(conn.cursor()) as cursor:
                        # Assumes local_infile is enabled on both the MySQL
                        # server and the client connection.
                        cursor.execute(
                            "LOAD DATA LOCAL INFILE '%s' INTO "
                            "TABLE %s LINES TERMINATED BY '\r\n' (%s)" %
                            (tmpfile.name, self.mysql_table,
                             ", ".join(selected_columns)))
                        conn.commit()
                tmpfile.close()
            else:
                self.log.info("Inserting rows into MySQL...")
                mysql.insert_rows(table=self.mysql_table,
                                  rows=result,
                                  target_fields=selected_columns)
            self.log.info("Inserted rows into MySQL %s", count)
        except (MySQLdb.Error, MySQLdb.Warning):
            self.log.info("Inserted rows into MySQL 0")
            raise

        if self.mysql_postoperator:
            self.log.info("Running MySQL postoperator...")
            mysql.run(self.mysql_postoperator)

        self.log.info("Done")
Example #11
 def execute(self, context):
     self.log.info('Executing: %s', self.sql)
     hook = VerticaHook(vertica_conn_id=self.vertica_conn_id)
     hook.run(self.sql)
Example #12
 def get_hook(self):
     if self.conn_type == 'mysql':
         from airflow.hooks.mysql_hook import MySqlHook
         return MySqlHook(mysql_conn_id=self.conn_id)
     elif self.conn_type == 'google_cloud_platform':
         from airflow.gcp.hooks.bigquery import BigQueryHook
         return BigQueryHook(bigquery_conn_id=self.conn_id)
     elif self.conn_type == 'postgres':
         from airflow.hooks.postgres_hook import PostgresHook
         return PostgresHook(postgres_conn_id=self.conn_id)
     elif self.conn_type == 'pig_cli':
         from airflow.hooks.pig_hook import PigCliHook
         return PigCliHook(pig_cli_conn_id=self.conn_id)
     elif self.conn_type == 'hive_cli':
         from airflow.hooks.hive_hooks import HiveCliHook
         return HiveCliHook(hive_cli_conn_id=self.conn_id)
     elif self.conn_type == 'presto':
         from airflow.hooks.presto_hook import PrestoHook
         return PrestoHook(presto_conn_id=self.conn_id)
     elif self.conn_type == 'hiveserver2':
         from airflow.hooks.hive_hooks import HiveServer2Hook
         return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
     elif self.conn_type == 'sqlite':
         from airflow.hooks.sqlite_hook import SqliteHook
         return SqliteHook(sqlite_conn_id=self.conn_id)
     elif self.conn_type == 'jdbc':
         from airflow.hooks.jdbc_hook import JdbcHook
         return JdbcHook(jdbc_conn_id=self.conn_id)
     elif self.conn_type == 'mssql':
         from airflow.hooks.mssql_hook import MsSqlHook
         return MsSqlHook(mssql_conn_id=self.conn_id)
     elif self.conn_type == 'oracle':
         from airflow.hooks.oracle_hook import OracleHook
         return OracleHook(oracle_conn_id=self.conn_id)
     elif self.conn_type == 'vertica':
         from airflow.contrib.hooks.vertica_hook import VerticaHook
         return VerticaHook(vertica_conn_id=self.conn_id)
     elif self.conn_type == 'cloudant':
         from airflow.contrib.hooks.cloudant_hook import CloudantHook
         return CloudantHook(cloudant_conn_id=self.conn_id)
     elif self.conn_type == 'jira':
         from airflow.contrib.hooks.jira_hook import JiraHook
         return JiraHook(jira_conn_id=self.conn_id)
     elif self.conn_type == 'redis':
         from airflow.contrib.hooks.redis_hook import RedisHook
         return RedisHook(redis_conn_id=self.conn_id)
     elif self.conn_type == 'wasb':
         from airflow.contrib.hooks.wasb_hook import WasbHook
         return WasbHook(wasb_conn_id=self.conn_id)
     elif self.conn_type == 'docker':
         from airflow.hooks.docker_hook import DockerHook
         return DockerHook(docker_conn_id=self.conn_id)
     elif self.conn_type == 'azure_data_lake':
         from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
         return AzureDataLakeHook(azure_data_lake_conn_id=self.conn_id)
     elif self.conn_type == 'azure_cosmos':
         from airflow.contrib.hooks.azure_cosmos_hook import AzureCosmosDBHook
         return AzureCosmosDBHook(azure_cosmos_conn_id=self.conn_id)
     elif self.conn_type == 'cassandra':
         from airflow.contrib.hooks.cassandra_hook import CassandraHook
         return CassandraHook(cassandra_conn_id=self.conn_id)
     elif self.conn_type == 'mongo':
         from airflow.contrib.hooks.mongo_hook import MongoHook
         return MongoHook(conn_id=self.conn_id)
     elif self.conn_type == 'gcpcloudsql':
         from airflow.gcp.hooks.cloud_sql import CloudSqlDatabaseHook
         return CloudSqlDatabaseHook(gcp_cloudsql_conn_id=self.conn_id)
     elif self.conn_type == 'grpc':
         from airflow.contrib.hooks.grpc_hook import GrpcHook
         return GrpcHook(grpc_conn_id=self.conn_id)
     raise AirflowException("Unknown hook type {}".format(self.conn_type))
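
This dispatch runs on an Airflow Connection object; a minimal sketch of
resolving a stored connection into its hook (assumes the conn_id exists in
the metadata database):

from airflow.hooks.base_hook import BaseHook

conn = BaseHook.get_connection('vertica_default')
hook = conn.get_hook()  # yields a VerticaHook when conn_type == 'vertica'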
Example #13
def get_vertica_status(**kwargs):
    cur = VerticaHook('vertica').get_cursor()
    sql = "SELECT 1"
    cur.execute(sql)
    result = cur.fetchall()
    return result[0][0]
Example #14
 def execute(self, context):
     _log.info('Executing: ' + str(self.sql))
     hook = VerticaHook(vertica_conn_id=self.vertica_conn_id)
     hook.run(self.sql)
Example #15
 def execute(self, context=None):
     hook = VerticaHook(vertica_conn_id=self.vertica_conn_id)
     with hook.get_conn() as conn:
         with conn.cursor() as cur:
             logging.info(self.sql)
             cur.copy(self.sql, self.source(**context))
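
self.source is expected to return a file-like object, since vertica-python's
cursor.copy() streams its data argument; a hedged sketch of such a callable
(the data is a placeholder):

from io import StringIO

def source(**context):
    # Hypothetical stand-in: any readable file-like object works here.
    return StringIO('1|2020-01-01\n2|2020-01-02\n')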