def execute(self, context):
    source_hook = BaseHook.get_hook(self.source_conn_id)
    self.log.info("Extracting data from %s", self.source_conn_id)
    self.log.info("Executing: \n %s", self.sql)
    results = source_hook.get_records(self.sql)
    destination_hook = BaseHook.get_hook(self.destination_conn_id)
    if self.preoperator:
        self.log.info("Running preoperator")
        self.log.info(self.preoperator)
        destination_hook.run(self.preoperator)
    self.log.info("Inserting rows into %s", self.destination_conn_id)
    destination_hook.insert_rows(table=self.destination_table, rows=results)
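# A hedged usage sketch, not part of the snippet above: the execute() it shows follows the
# same source->destination pattern as Airflow's GenericTransfer operator, so a task built on
# that operator would look roughly like this. All connection ids, table names and SQL below
# are placeholders.
from datetime import datetime

from airflow import DAG
from airflow.operators.generic_transfer import GenericTransfer

with DAG('generic_transfer_example', start_date=datetime(2021, 1, 1), schedule_interval=None) as dag:
    copy_users = GenericTransfer(
        task_id='copy_users',
        source_conn_id='mysql_default',
        destination_conn_id='postgres_default',
        destination_table='users_copy',
        sql='SELECT * FROM users',
        preoperator='TRUNCATE TABLE users_copy',
    )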
def execute(self, context):
    source_hook = BaseHook.get_hook(self.source_conn_id)
    logging.info("Extracting data from {}".format(self.source_conn_id))
    logging.info("Executing: \n" + self.sql)
    results = source_hook.get_records(self.sql)
    destination_hook = BaseHook.get_hook(self.destination_conn_id)
    if self.preoperator:
        logging.info("Running preoperator")
        logging.info(self.preoperator)
        destination_hook.run(self.preoperator)
    logging.info("Inserting rows into {}".format(self.destination_conn_id))
    destination_hook.insert_rows(table=self.destination_table, rows=results)
def execute(self, context):
    source_hook = BaseHook.get_hook(self.source_conn_id)
    logging.info("Extracting data from {}".format(self.source_conn_id))
    logging.info("Executing: \n" + self.sql)
    results = source_hook.get_records(self.sql)
    destination_hook = TeradataHook(teradata_conn_id=self.destination_conn_id)
    if self.preoperator:
        logging.info("Running preoperator")
        logging.info(self.preoperator)
        destination_hook.run(self.preoperator)
    if self.batch:
        logging.info(
            "Inserting {} rows into {} with a batch size of {} rows".format(
                len(results), self.destination_conn_id, self.batch_size))
        destination_hook.bulk_insert_rows(
            table=self.destination_table,
            rows=iter(results),
            commit_every=self.batch_size,
            unicode_source=self.unicode_source)
    else:
        logging.info("Inserting {} rows into {}".format(
            len(results), self.destination_conn_id))
        destination_hook.insert_rows(
            table=self.destination_table,
            rows=iter(results),
            commit_every=1000,
            unicode_source=self.unicode_source)
def execute(self, context):
    source_hook = BaseHook.get_hook(self.source_conn_id)
    self.logger.info("Extracting data from %s", self.source_conn_id)
    self.logger.info("Executing: \n %s", self.sql)
    results = source_hook.get_records(self.sql)
    destination_hook = BaseHook.get_hook(self.destination_conn_id)
    if self.preoperator:
        self.logger.info("Running preoperator")
        self.logger.info(self.preoperator)
        destination_hook.run(self.preoperator)
    self.logger.info("Inserting rows into %s", self.destination_conn_id)
    destination_hook.insert_rows(table=self.destination_table, rows=results)
def _setup_source_conn(self, source_conn_id, source_bucket_name=None):
    """
    Retrieve the connection based on source_conn_id. In the case of s3 it also configures the bucket.
    Validates that the connection id belongs to a supported connection type.

    :param source_conn_id: Airflow connection id of the source
    :param source_bucket_name: bucket name, required when the source is an s3 connection
    """
    self.source_conn = BaseHook.get_hook(source_conn_id)
    self.source_conn_id = source_conn_id

    # Workaround for getting the hook in case of an s3 connection.
    # This is needed because get_hook silently returns None for s3 connections.
    # See https://issues.apache.org/jira/browse/AIRFLOW-2316 for more info.
    connection = BaseHook._get_connection_from_env(source_conn_id)
    self.log.info(connection.extra_dejson)
    if connection.conn_type == 's3':
        self.log.info("Setting up s3 connection {0}".format(source_conn_id))
        self.source_conn = S3Hook(aws_conn_id=source_conn_id)
        # End workaround
        if source_bucket_name is None:
            raise AttributeError("Missing source bucket for s3 connection")
        self.source_bucket_name = source_bucket_name

    if not isinstance(self.source_conn, DbApiHook) and not isinstance(self.source_conn, S3Hook):
        raise AttributeError(
            "Only s3_csv, local and sql connection types are allowed, not {0}".format(type(self.source_conn)))
def test_check_operators(self):
    conn_id = "sqlite_default"
    captain_hook = BaseHook.get_hook(conn_id=conn_id)  # quite funny :D
    captain_hook.run("CREATE TABLE operator_test_table (a, b)")
    captain_hook.run("insert into operator_test_table values (1,2)")

    op = CheckOperator(
        task_id='check',
        sql="select count(*) from operator_test_table",
        conn_id=conn_id,
        dag=self.dag)
    op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

    op = ValueCheckOperator(
        task_id='value_check',
        pass_value=95,
        tolerance=0.1,
        conn_id=conn_id,
        sql="SELECT 100",
        dag=self.dag)
    op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

    captain_hook.run("drop table operator_test_table")
def hook(self) -> BaseHook:
    try:
        return BaseHook.get_hook(self.conn_id)
    except AirflowException as e:
        if 'Unknown hook type' not in str(e):
            raise
        return self.get_custom_hook()
def execute(self, context):
    source_hook = BaseHook.get_hook(self.source_conn_id)
    _log.info("Extracting data from {}".format(self.source_conn_id))
    _log.info("Executing: \n" + self.sql)
    results = source_hook.get_records(self.sql)
    destination_hook = BaseHook.get_hook(self.destination_conn_id)
    if self.preoperator:
        _log.info("Running preoperator")
        _log.info(self.preoperator)
        destination_hook.run(self.preoperator)
    _log.info("Inserting rows into {}".format(self.destination_conn_id))
    destination_hook.insert_rows(table=self.destination_table, rows=results)
def get_db_hook(self):
    """
    Get the database hook for the connection.

    :return: the database hook object.
    :rtype: DbApiHook
    """
    return BaseHook.get_hook(conn_id=self.conn_id)
def load(mean_fare_per_class: str, pivot_dataset: str):
    # Both arguments are expected to be JSON-serialised dataframes.
    hook = BaseHook.get_hook('airflow')
    mean_fare_per_class_df = pd.read_json(mean_fare_per_class)
    mean_fare_per_class_df.to_sql('mean_fare_per_class_table', hook.get_sqlalchemy_engine())
    pivot_dataset_df = pd.read_json(pivot_dataset)
    pivot_dataset_df.to_sql('pivot_dataset_table', hook.get_sqlalchemy_engine())
def hook(input):
    # input = (table_name, df)
    # Target connection: data_warehouse
    hook = BaseHook.get_hook(conn_id='data_warehouse')
    postgreSQLConnection = hook.get_sqlalchemy_engine()
    postgreSQLTable = input[0]
    df = pd.read_json(input[1])
    try:
        df.to_sql(postgreSQLTable, postgreSQLConnection, if_exists='fail')
    except ValueError as vx:
        print(vx)
    except Exception as ex:
        print(ex)
    else:
        print("PostgreSQL Table %s has been created successfully." % postgreSQLTable)
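# A minimal usage sketch for the hook() loader above, assuming a (table_name, json_payload)
# tuple where the payload is a JSON-serialised dataframe. The table name and data are
# illustrative only, and 'data_warehouse' must exist as an Airflow connection.
sample_df = pd.DataFrame({'id': [1, 2], 'value': ['a', 'b']})
hook(('staging_values', sample_df.to_json()))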
def exec_sql(conn_id, query, parameters=None, fetch=True):
    logging.info(f'Executing: {query}')
    hook = BaseHook.get_hook(conn_id)
    if fetch:
        res = hook.get_records(query, parameters=parameters)
    else:
        res = hook.run(query, parameters=parameters)
    if hasattr(hook, 'conn'):
        for output in hook.conn.notices:
            logging.info(output)
    return res
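# A minimal usage sketch for the exec_sql() helper above; 'postgres_default' and the table
# name are placeholders. With fetch=True it returns hook.get_records() output, with
# fetch=False it just runs the statement.
rows = exec_sql('postgres_default',
                'SELECT id, name FROM users WHERE id = %(id)s',
                parameters={'id': 42})
exec_sql('postgres_default', 'VACUUM ANALYZE users', fetch=False)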
def connection_operator(**context):
    hook = BaseHook.get_hook('airflow')
    con = context

    mean_fare_per_class_df = context['task_instance'].xcom_pull(
        task_ids="mean_fare_per_class", key='mean_fare_per_class_xcom_key')
    mean_fare_per_class_df = pd.read_json(mean_fare_per_class_df)
    mean_fare_per_class_table = f'mean_fare_per_class_{ con["dag_run"].run_id }'
    mean_fare_per_class_df.to_sql(mean_fare_per_class_table, hook.get_sqlalchemy_engine())

    pivot_dataset_df = context['task_instance'].xcom_pull(
        task_ids="pivot_dataset", key='pivot_dataset_xcom_key')
    pivot_dataset_df = pd.read_json(pivot_dataset_df)
    pivot_dataset_table = f'pivot_dataset_{ con["dag_run"].run_id }'
    pivot_dataset_df.to_sql(pivot_dataset_table, hook.get_sqlalchemy_engine())
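# A hedged sketch of how a callable like connection_operator() above is typically wired into
# a DAG. The DAG id and schedule are assumptions for illustration; in Airflow 2 the context
# is passed automatically, while Airflow 1.10 would also need provide_context=True.
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

with DAG('connection_operator_example', start_date=datetime(2021, 1, 1), schedule_interval=None) as dag:
    load_task = PythonOperator(
        task_id='connection_operator',
        python_callable=connection_operator,
    )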
def execute(self, context):
    ## read csv file
    csvfile = os.path.join(self.CSVDIR, self.upload_csvfile)
    csvdf = pd.read_csv(csvfile)
    ## convert nan type to None
    csvdf = csvdf.where((pd.notnull(csvdf)), None)
    # rows for insert
    csvdf_rows = list(csvdf.itertuples(index=False, name=None))

    self.log.info("Loading csv file into SQLDB")
    db_hook = BaseHook.get_hook(conn_id=self.upload_conn_id)

    # insert method1: insert_rows
    # db_hook.insert_rows(table=self.upload_tbl_list, rows=csvdf_rows)

    # insert method2: pd.to_sql
    extras = BaseHook.get_connection(conn_id=self.upload_conn_id).extra
    sql_conn = create_engine(db_hook.get_uri(), connect_args=json.loads(extras))
    csvdf.to_sql(self.upload_tbl_list, con=sql_conn, index=False, if_exists='append')
def get_db_hook(self):
    """ Returns DB hook """
    return BaseHook.get_hook(conn_id=self.conn_id)
def get_db_hook(self):
    return BaseHook.get_hook(conn_id=self.conn_id)
def execute(self, context):
    self.log.info('Executing: %s', self.sql)
    self.hook = BaseHook.get_hook(self.conn_id)
    self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
    for output in self.hook.conn.notices:
        self.log.info(output)
def get_tblnm_list(conn_id):
    hookbs = BaseHook(source=None)
    ## conn_type
    # conn_type = hookbs.get_connection(conn_id=conn_id).conn_type
    db_hook = hookbs.get_hook(conn_id=conn_id)
    return db_hook.get_sqlalchemy_engine().table_names()
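# Usage sketch for get_tblnm_list() above; 'postgres_default' is a placeholder connection id.
# The result is whatever SQLAlchemy's engine.table_names() reports for that database.
for table_name in get_tblnm_list('postgres_default'):
    print(table_name)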
def connection_operator(**context):
    mssql_hook = BaseHook.get_hook('airflow')
    mssql_hook.get_records('SELECT * FROM connection')
def execute(self, context):
    t_func = {
        'timestamp': lambda val: pendulum.parse(val).timestamp(),
        'datetime': lambda val: pendulum.parse(val).to_datetime_string()
    }

    source_hook = BaseHook.get_hook(self.source_conn_id)
    destination_hook = BaseHook.get_hook(self.destination_conn_id)

    fields, constraints = db_util.get_table_fields(self.source_conn_id, self.source_table)
    fields = db_util.convert_data_types(fields, src_db='mysql', dest_db='postgres')
    db_util.create_table(self.destination_conn_id, self.destination_table, fields, constraints)

    condition_lst, conditions = [], ''
    for filter, data_type in self.filters.items():
        date_start = t_func[data_type](self.prev_exec_date)
        date_end = t_func[data_type](self.exec_date)
        if type(date_start) == str:
            condition_lst.append(f''' ({filter} > '{date_start}' AND {filter} <= '{date_end}') ''')
        else:
            condition_lst.append(f''' ({filter} > {date_start} AND {filter} <= {date_end}) ''')

    if condition_lst:
        conditions = 'OR '.join(condition_lst)
        conditions = f'WHERE {conditions}'

    sql = f'''
        SELECT *
        FROM {self.source_table}
        {conditions}
    '''

    self.log.info(f'Extracting data from {self.source_conn_id}')
    self.log.info(f'Executing: \n {sql}')
    cur = source_hook.get_cursor()
    cur.execute(sql)

    if self.preoperator:
        self.log.info('Running preoperator')
        self.log.info(self.preoperator)
        destination_hook.run(self.preoperator, autocommit=True)

    self.log.info(f'Inserting rows into {self.destination_conn_id}')
    if self.insert_dialect == 'postgres':
        if self.filters:
            db_util.pg_bulk_load(self.destination_conn_id, cur, self.destination_table, fields, constraints)
        else:
            db_util.pg_bulk_load(self.destination_conn_id, cur, self.destination_table, fields, constraints=None)
        # if self.filters:
        #     self.log.info('Executing Upsert Operation')
        #     db_util.pg_load(self.destination_conn_id, cur, self.destination_table, fields, constraints)
        #
        # elif self.bulk_load:
        #     db_util.pg_bulk_load(self.destination_conn_id, cur, self.destination_table, fields, constraints)
        # else:
        #     self.log.info('Executing Normal Load Operation')
        #     db_util.pg_load(self.destination_conn_id, cur, self.destination_table, fields)
    else:
        self.log.info('Executing Normal Load Operation')
        destination_hook.insert_rows(table=self.destination_table, rows=cur)
def connection_operator(**context):
    hook = BaseHook.get_hook('airflow')
    import pdb; pdb.set_trace()