Example #1
    def execute(self, context):
        source_hook = BaseHook.get_hook(self.source_conn_id)

        self.log.info("Extracting data from %s", self.source_conn_id)
        self.log.info("Executing: \n %s", self.sql)
        results = source_hook.get_records(self.sql)

        destination_hook = BaseHook.get_hook(self.destination_conn_id)
        if self.preoperator:
            self.log.info("Running preoperator")
            self.log.info(self.preoperator)
            destination_hook.run(self.preoperator)

        self.log.info("Inserting rows into %s", self.destination_conn_id)
        destination_hook.insert_rows(table=self.destination_table, rows=results)
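
This execute method matches Airflow's GenericTransfer operator: run a query on the source connection, optionally run a preoperator statement on the destination, then insert the fetched rows. A minimal sketch of wiring such a transfer into a DAG (connection ids, table names, and the preoperator are placeholders):

from datetime import datetime

from airflow import DAG
from airflow.operators.generic_transfer import GenericTransfer

with DAG(dag_id='generic_transfer_example',
         start_date=datetime(2021, 1, 1),
         schedule_interval=None) as dag:
    transfer = GenericTransfer(
        task_id='mysql_to_postgres',
        source_conn_id='mysql_default',          # connection the SQL runs against
        destination_conn_id='postgres_default',  # connection rows are inserted into
        sql='SELECT * FROM source_table',
        destination_table='destination_table',
        # optional statement executed on the destination before the insert
        preoperator='TRUNCATE TABLE destination_table',
    )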
Example #2
    def execute(self, context):
        source_hook = BaseHook.get_hook(self.source_conn_id)

        logging.info("Extracting data from {}".format(self.source_conn_id))
        logging.info("Executing: \n" + self.sql)
        results = source_hook.get_records(self.sql)

        destination_hook = BaseHook.get_hook(self.destination_conn_id)
        if self.preoperator:
            logging.info("Running preoperator")
            logging.info(self.preoperator)
            destination_hook.run(self.preoperator)

        logging.info("Inserting rows into {}".format(self.destination_conn_id))
        destination_hook.insert_rows(table=self.destination_table, rows=results)
Example #3
    def execute(self, context):
        source_hook = BaseHook.get_hook(self.source_conn_id)

        logging.info("Extracting data from {}".format(self.source_conn_id))
        logging.info("Executing: \n" + self.sql)
        results = source_hook.get_records(self.sql)

        destination_hook = TeradataHook(
            teradata_conn_id=self.destination_conn_id)
        if self.preoperator:
            logging.info("Running preoperator")
            logging.info(self.preoperator)
            destination_hook.run(self.preoperator)

        if self.batch:
            logging.info(
                "Inserting {} rows into {} with a batch size of {} rows".
                format(len(results), self.destination_conn_id,
                       self.batch_size))
            destination_hook.bulk_insert_rows(
                table=self.destination_table,
                rows=iter(results),
                commit_every=self.batch_size,
                unicode_source=self.unicode_source)
        else:
            logging.info("Inserting {} rows into {}".format(
                len(results), self.destination_conn_id))
            destination_hook.insert_rows(table=self.destination_table,
                                         rows=iter(results),
                                         commit_every=1000,
                                         unicode_source=self.unicode_source)
Example #4
    def execute(self, context):
        source_hook = BaseHook.get_hook(self.source_conn_id)

        self.logger.info("Extracting data from %s", self.source_conn_id)
        self.logger.info("Executing: \n %s", self.sql)
        results = source_hook.get_records(self.sql)

        destination_hook = BaseHook.get_hook(self.destination_conn_id)
        if self.preoperator:
            self.logger.info("Running preoperator")
            self.logger.info(self.preoperator)
            destination_hook.run(self.preoperator)

        self.logger.info("Inserting rows into %s", self.destination_conn_id)
        destination_hook.insert_rows(table=self.destination_table,
                                     rows=results)
Example #5
    def _setup_source_conn(self, source_conn_id, source_bucket_name=None):
        """
        Retrieve connection based on source_conn_id. In case of s3 it also configures the bucket.
        Validates that connection id belongs to supported connection type.
        :param source_conn_id:
        :param source_bucket_name:
        """
        self.source_conn = BaseHook.get_hook(source_conn_id)
        self.source_conn_id = source_conn_id

        # Workaround for getting hook in case of s3 connection
        # This is needed because get_hook silently returns None for s3 connections
        # See https://issues.apache.org/jira/browse/AIRFLOW-2316 for more info
        connection = BaseHook._get_connection_from_env(source_conn_id)
        self.log.info(connection.extra_dejson)
        if connection.conn_type == 's3':
            self.log.info("Setting up s3 connection {0}".format(source_conn_id))
            self.source_conn = S3Hook(aws_conn_id=source_conn_id)
            # End Workaround
            if source_bucket_name is None:
                raise AttributeError("Missing source bucket for s3 connection")
            self.source_bucket_name = source_bucket_name

        if not isinstance(self.source_conn, DbApiHook) and not isinstance(self.source_conn, S3Hook):
            raise AttributeError(
                "Only s3_csv, local and sql connection types are allowed, not {0}".format(type(self.source_conn)))
Example #6
    def test_check_operators(self):

        conn_id = "sqlite_default"

        captain_hook = BaseHook.get_hook(conn_id=conn_id)  # quite funny :D
        captain_hook.run("CREATE TABLE operator_test_table (a, b)")
        captain_hook.run("insert into operator_test_table values (1,2)")

        op = CheckOperator(
            task_id='check',
            sql="select count(*) from operator_test_table",
            conn_id=conn_id,
            dag=self.dag)
        op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

        op = ValueCheckOperator(
            task_id='value_check',
            pass_value=95,
            tolerance=0.1,
            conn_id=conn_id,
            sql="SELECT 100",
            dag=self.dag)
        op.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

        captain_hook.run("drop table operator_test_table")
Example #7
    def hook(self) -> BaseHook:
        try:
            return BaseHook.get_hook(self.conn_id)
        except AirflowException as e:
            if 'Unknown hook type' not in str(e):
                raise
            return self.get_custom_hook()
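
This property falls back to get_custom_hook when BaseHook.get_hook raises an 'Unknown hook type' AirflowException. The original helper is not shown; a purely hypothetical sketch of such a fallback, mapping an unrecognized connection type to a hook class of your own choosing:

    def get_custom_hook(self) -> BaseHook:
        # Hypothetical fallback: resolve hooks that get_hook does not know about.
        conn = BaseHook.get_connection(self.conn_id)
        if conn.conn_type == 'http':
            from airflow.hooks.http_hook import HttpHook
            return HttpHook(http_conn_id=self.conn_id)
        raise AirflowException('No custom hook for conn type %r' % conn.conn_type)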
Example #8
    def execute(self, context):
        source_hook = BaseHook.get_hook(self.source_conn_id)

        _log.info("Extracting data from {}".format(self.source_conn_id))
        _log.info("Executing: \n" + self.sql)
        results = source_hook.get_records(self.sql)

        destination_hook = BaseHook.get_hook(self.destination_conn_id)
        if self.preoperator:
            _log.info("Running preoperator")
            _log.info(self.preoperator)
            destination_hook.run(self.preoperator)

        _log.info("Inserting rows into {}".format(self.destination_conn_id))
        destination_hook.insert_rows(table=self.destination_table,
                                     rows=results)
Example #9
    def get_db_hook(self):
        """
        Get the database hook for the connection.
        :return: the database hook object.
        :rtype: DbApiHook
        """
        return BaseHook.get_hook(conn_id=self.conn_id)
Example #10
    def load(mean_fare_per_class: dict, pivot_dataset: dict):
        hook = BaseHook.get_hook('airflow')

        mean_fare_per_class_df = pd.read_json(mean_fare_per_class)
        mean_fare_per_class_df.to_sql('mean_fare_per_class_table', hook.get_sqlalchemy_engine())


        pivot_dataset_df = pd.read_json(pivot_dataset)
        pivot_dataset_df.to_sql('pivot_dataset_table', hook.get_sqlalchemy_engine())
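
Note that despite the dict annotations, both arguments arrive as JSON strings that pd.read_json can parse. A hypothetical call with placeholder data:

import pandas as pd

df = pd.DataFrame({'pclass': [1, 2, 3], 'fare': [84.15, 20.66, 13.68]})
load(mean_fare_per_class=df.to_json(), pivot_dataset=df.to_json())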
Example #11
    def hook(input):  # input = (table_name, df)
        # data_warehouse
        hook = BaseHook.get_hook(conn_id='data_warehouse')
        postgreSQLConnection = hook.get_sqlalchemy_engine()
        postgreSQLTable = input[0]
        df = pd.read_json(input[1])
        try:
            df.to_sql(postgreSQLTable, postgreSQLConnection, if_exists='fail')
        except ValueError as vx:
            print(vx)
        except Exception as ex:
            print(ex)
        else:
            print("PostgreSQL Table %s has been created successfully." %
                  postgreSQLTable)
Example #12
def exec_sql(conn_id, query, parameters=None, fetch=True):
    logging.info(f'Executing: {query}')

    hook = BaseHook.get_hook(conn_id)

    if fetch:
        res = hook.get_records(query, parameters=parameters)
    else:
        res = hook.run(query, parameters=parameters)

    if hasattr(hook, 'conn'):
        for output in hook.conn.notices:
            logging.info(output)

    return res
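
A hypothetical call, assuming a Postgres connection named 'postgres_default' exists (fetch defaults to True, so get_records is used unless fetch=False):

rows = exec_sql('postgres_default',
                'SELECT id, name FROM users WHERE active = %s',
                parameters=(True,))
exec_sql('postgres_default', 'DELETE FROM staging_table', fetch=False)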
Example #13
def connection_operator(**context):
    hook = BaseHook.get_hook('airflow')

    con = context

    mean_fare_per_class_df = context['task_instance'].xcom_pull(task_ids="mean_fare_per_class", key='mean_fare_per_class_xcom_key')
    mean_fare_per_class_df = pd.read_json(mean_fare_per_class_df)
    mean_fare_per_class_table = f'mean_fare_per_class_{ con["dag_run"].run_id }'
    mean_fare_per_class_df.to_sql(mean_fare_per_class_table, hook.get_sqlalchemy_engine())


    pivot_dataset_df = context['task_instance'].xcom_pull(task_ids="pivot_dataset",
                                                          key='pivot_dataset_xcom_key')
    pivot_dataset_df = pd.read_json(pivot_dataset_df)
    pivot_dataset_table = f'pivot_dataset_{ con["dag_run"].run_id }'
    pivot_dataset_df.to_sql(pivot_dataset_table, hook.get_sqlalchemy_engine())
Example #15
    def execute(self, context):
        ## read csv file
        csvfile = os.path.join(self.CSVDIR, self.upload_csvfile)
        csvdf = pd.read_csv(csvfile)

        ## convert nan type to None
        csvdf = csvdf.where((pd.notnull(csvdf)), None)
        # rows for insert
        csvdf_rows = list(csvdf.itertuples(index=False, name=None))

        self.log.info("Loading csv file into SQLDB")
        db_hook = BaseHook.get_hook(conn_id=self.upload_conn_id)

        # insert method1: insert_rows
        # db_hook.insert_rows(table=self.upload_tbl_list, rows=csvdf_rows)

        # insert method2: pd.to_sql
        extras = BaseHook.get_connection(conn_id=self.upload_conn_id).extra
        sql_conn = create_engine(db_hook.get_uri(),
                                 connect_args=json.loads(extras))
        csvdf.to_sql(self.upload_tbl_list,
                     con=sql_conn,
                     index=False,
                     if_exists='append')
Example #16
    def get_db_hook(self):
        """
        Returns DB hook
        """
        return BaseHook.get_hook(conn_id=self.conn_id)
Example #17
    def get_db_hook(self):
        return BaseHook.get_hook(conn_id=self.conn_id)
Example #18
    def execute(self, context):
        self.log.info('Executing: %s', self.sql)
        self.hook = BaseHook.get_hook(self.conn_id)
        self.hook.run(self.sql, self.autocommit, parameters=self.parameters)
        for output in self.hook.conn.notices:
            self.log.info(output)
Example #19
def get_tblnm_list(conn_id):
    hookbs = BaseHook(source=None)
    ## conn_type
    # conn_type = hookbs.get_connection(conn_id=conn_id).conn_type
    db_hook = hookbs.get_hook(conn_id=conn_id)
    return db_hook.get_sqlalchemy_engine().table_names()
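
Usage is a one-liner; note that Engine.table_names() was deprecated in SQLAlchemy 1.4 in favor of inspect(engine).get_table_names():

tables = get_tblnm_list('sqlite_default')
print(tables)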
Example #20
def connection_operator(**context):
    mssql_hook = BaseHook.get_hook('airflow')
    mssql_hook.get_records('SELECT * FROM connection')
Example #21
    def get_db_hook(self):
        return BaseHook.get_hook(conn_id=self.conn_id)
Example #22
    def execute(self, context):
        t_func = {
            'timestamp': lambda val: pendulum.parse(val).timestamp(),
            'datetime': lambda val: pendulum.parse(val).to_datetime_string()
        }

        source_hook = BaseHook.get_hook(self.source_conn_id)
        destination_hook = BaseHook.get_hook(self.destination_conn_id)

        fields, constraints = db_util.get_table_fields(self.source_conn_id,
                                                       self.source_table)
        fields = db_util.convert_data_types(fields,
                                            src_db='mysql',
                                            dest_db='postgres')
        db_util.create_table(self.destination_conn_id, self.destination_table,
                             fields, constraints)

        condition_lst, conditions = [], ''

        for filter, data_type in self.filters.items():
            date_start = t_func[data_type](self.prev_exec_date)
            date_end = t_func[data_type](self.exec_date)

            if type(date_start) == str:
                condition_lst.append(f'''
                ({filter} > '{date_start}' AND 
                {filter} <= '{date_end}')
                ''')
            else:
                condition_lst.append(f'''
                ({filter} > {date_start} AND 
                {filter} <= {date_end})
                ''')

        if condition_lst:
            conditions = 'OR '.join(condition_lst)
            conditions = f'WHERE {conditions}'

        sql = f'''
        SELECT *
        FROM {self.source_table}
        {conditions}
        '''

        self.log.info(f'Extracting data from {self.source_conn_id}')
        self.log.info(f'Executing: \n {sql}')

        cur = source_hook.get_cursor()
        cur.execute(sql)

        if self.preoperator:
            self.log.info('Running preoperator')
            self.log.info(self.preoperator)
            destination_hook.run(self.preoperator, autocommit=True)

        self.log.info(f'Inserting rows into {self.destination_conn_id}')

        if self.insert_dialect == 'postgres':

            if self.filters:
                db_util.pg_bulk_load(self.destination_conn_id, cur,
                                     self.destination_table, fields,
                                     constraints)
            else:
                db_util.pg_bulk_load(self.destination_conn_id,
                                     cur,
                                     self.destination_table,
                                     fields,
                                     constraints=None)

            # if self.filters:
            #     self.log.info('Executing Upsert Operation')
            #     db_util.pg_load(self.destination_conn_id, cur, self.destination_table, fields, constraints)
            #
            # elif self.bulk_load:
            #     db_util.pg_bulk_load(self.destination_conn_id, cur, self.destination_table, fields, constraints)
            # else:
            #     self.log.info('Executing Normal Load Operation')
            #     db_util.pg_load(self.destination_conn_id, cur, self.destination_table, fields)
        else:
            self.log.info('Executing Normal Load Operation')
            destination_hook.insert_rows(table=self.destination_table,
                                         rows=cur)
Example #23
def connection_operator(**context):
    hook = BaseHook.get_hook('airflow')
    import pdb; pdb.set_trace()