def execute(self, context):
        hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id)
        logging.info("Extracting data from Hive")
        logging.info(self.sql)

        if self.bulk_load:
            tmpfile = NamedTemporaryFile()
            hive.to_csv(self.sql,
                        tmpfile.name,
                        delimiter='\t',
                        lineterminator='\n',
                        output_header=False)
        else:
            results = hive.get_records(self.sql)

        mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
        if self.mysql_preoperator:
            logging.info("Running MySQL preoperator")
            mysql.run(self.mysql_preoperator)

        logging.info("Inserting rows into MySQL")

        if self.bulk_load:
            mysql.bulk_load(table=self.mysql_table, tmp_file=tmpfile.name)
            tmpfile.close()
        else:
            mysql.insert_rows(table=self.mysql_table, rows=results)

        if self.mysql_postoperator:
            logging.info("Running MySQL postoperator")
            mysql.run(self.mysql_postoperator)

        logging.info("Done.")
    def test_mysql_hook_test_bulk_load(self):
        records = ("foo", "bar", "baz")

        import tempfile
        with tempfile.NamedTemporaryFile() as t:
            t.write("\n".join(records).encode('utf8'))
            t.flush()

            from airflow.hooks.mysql_hook import MySqlHook
            h = MySqlHook('airflow_ci')
            with h.get_conn() as c:
                c.execute("""
                    CREATE TABLE IF NOT EXISTS test_airflow (
                        dummy VARCHAR(50)
                    )
                """)
                c.execute("TRUNCATE TABLE test_airflow")
                h.bulk_load("test_airflow", t.name)
                c.execute("SELECT dummy FROM test_airflow")
                results = tuple(result[0] for result in c.fetchall())
                self.assertEqual(sorted(results), sorted(records))
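The test above exercises MySqlHook.bulk_load, which loads the tab-delimited file through MySQL's LOAD DATA LOCAL INFILE, so local_infile must be permitted on both the server and the client connection. A minimal sketch of the same call outside a test case, with a placeholder connection id and table name, could look like this:

from tempfile import NamedTemporaryFile
from airflow.hooks.mysql_hook import MySqlHook

rows = ("foo", "bar", "baz")
with NamedTemporaryFile(mode='w') as tmp:
    # One value per line; columns within a line would be tab-separated
    tmp.write("\n".join(rows))
    tmp.flush()
    # 'mysql_default' and 'test_airflow' are placeholders for a real
    # connection id and an existing single-column table
    MySqlHook('mysql_default').bulk_load("test_airflow", tmp.name)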
def readS3FilesAndLoadtoMySql(**kwargs):
    """
    Read data from CSV/JSON files in S3 and bulk load it into MySQL.
    :param kwargs: task context plus job parameters (aws_conn_id, src_bucket,
        mysql_conn, schema, table)
    :return: None
    """
    s3_files = fetchFilesBasedonPattern(**kwargs)

    tmp_trg_file_path = "/tmp/s3mysqlload_" + str(round(datetime.now().timestamp())) + "/"
    if s3_files is None:
        raise Exception("No files are available to process")
    else:
        files_df = pd.DataFrame()
        # Record the processed file list in XCom so downstream tasks can reference it
        data = ",".join(s3_files)
        kwargs['ti'].xcom_push(key='s3_data_files', value=data)
        s3_client = generateS3Hook(kwargs["aws_conn_id"])

        # Read every CSV/JSON file from S3 into a single dataframe
        for path in s3_files:
            file_name = getFileName(path)
            if ".csv" in file_name.lower():
                files_df = files_df.append(pd.read_csv(
                    io.BytesIO(s3_client.get_key(key=path, bucket_name=kwargs['src_bucket']).get()['Body'].read())))
            elif ".json" in file_name.lower():
                files_df = files_df.append(pd.read_json(
                    io.BytesIO(s3_client.get_key(key=path, bucket_name=kwargs['src_bucket']).get()['Body'].read())))

        if len(files_df) > 0:
            # Write the combined dataframe to a tab-separated temp file and bulk load it
            if not os.path.exists(tmp_trg_file_path):
                os.makedirs(tmp_trg_file_path)
            file_path = tmp_trg_file_path + str(round(datetime.now().timestamp())) + ".tsv"
            files_df.to_csv(file_path, sep="\t", index=False, header=False, line_terminator="\n")
            mysql_client = MySqlHook(mysql_conn_id=kwargs["mysql_conn"])
            mysql_client.bulk_load(table=kwargs["schema"] + "." + kwargs["table"], tmp_file=file_path)

            shutil.rmtree(tmp_trg_file_path)
        else:
            raise Exception("Source files are empty")
def bulk_load_sql(table_name, **kwargs):
    # Path of the local file produced by the upstream 'download_file' task
    local_filepath = kwargs['ti'].xcom_pull(task_ids='download_file')
    conn = MySqlHook(mysql_conn_id='ib_sql')
    conn.bulk_load(table_name, local_filepath)
    return table_name
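bulk_load_sql expects an upstream task with task_id 'download_file' to have returned the local file path as its XCom value. A hypothetical upstream task of that shape is sketched below; the URL, local path and use of requests are placeholder assumptions.

from airflow.operators.python_operator import PythonOperator
import requests

def download_file(**kwargs):
    # Placeholder URL and path; the returned value is pushed to XCom
    # automatically and later pulled by bulk_load_sql.
    local_path = '/tmp/ib_export.tsv'
    response = requests.get('https://example.com/export.tsv')
    response.raise_for_status()
    with open(local_path, 'wb') as f:
        f.write(response.content)
    return local_path

download_task = PythonOperator(
    task_id='download_file',
    python_callable=download_file,
    provide_context=True,
    dag=dag,
)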
class LoadOltpOperator(BaseOperator):
    """
    This operator updates/deletes/inserts records in a MySQL OLTP table.
    First it finds the keys of the records in the source .csv,
    then it deletes the records with those keys from the OLTP table,
    then it removes all records with action flag "C" (closed) from the dataframe,
    then it writes the modified dataframe to a tab-delimited .csv file,
    and finally it bulk loads that .csv file into the MySQL OLTP table.

    :param src_file: source .csv file path
    :type src_file: str
    :param src_file_attr: source table metadata (action and key column names)
    :type src_file_attr: dict
    :param tgt_table_conn_id: reference to the OLTP MySQL database
    :type tgt_table_conn_id: str
    :param db_table: target table
    :type db_table: str
    :param df: final dataframe which will be written to .csv
    :type df: pandas.DataFrame
    :param output_filename: full path of the tab-delimited output file
    :type output_filename: str
    """

    #template_fields = ('sql', 'parameters', 'mysql_table')
    #template_ext = ('.sql',)
    ui_color = '#ededed'

    @apply_defaults
    def __init__(self,
                 src_file="/usr/local/airflow/csv_lz_2/cleaned_depts.csv",
                 src_file_attr={
                     "action": "action",
                     "key": "department_id"
                 },
                 tgt_table_conn_id='mysql_default',
                 db_table=None,
                 df=None,
                 output_filename="/usr/local/airflow/csv_lz_2/depts_td.csv",
                 *args,
                 **kwargs):
        super(LoadOltpOperator, self).__init__(*args, **kwargs)
        self.src_file = src_file
        self.src_file_attr = src_file_attr
        self.tgt_table_conn_id = tgt_table_conn_id
        self.db_table = db_table
        self.conn = MySqlHook(mysql_conn_id=tgt_table_conn_id)
        self.output_filename = output_filename

    def read_into_df(self, file_):
        """
          Returns pandas df from .csv file
        """
        return pd.read_csv(file_, sep=",", encoding="utf-8")

    def keys_as_tuples(self, df):
        """
          Returns keys from dataframe as array of tuples
          This will be needed to delete records from MySql table
        """
        keys_df = df[[self.src_file_attr["key"]]]
        return [tuple(x) for x in keys_df.values]

    def delete_db_records(self, keys):
        """
          Deletes records from MySQL table
        """
        delete_sql = """DELETE FROM {table_name} WHERE {key} = %s""".format(
            table_name=self.db_table, key=self.src_file_attr["key"])
        for key in keys:
            self.conn.run(delete_sql, autocommit=True, parameters=key)

    def write_to_file(self, df):
        """
         First drop "action" column, then write dataframe to .csv file
        """
        df.drop(self.src_file_attr["action"], axis=1, inplace=True)
        df.to_csv(self.output_filename,
                  sep='\t',
                  header=False,
                  index=False,
                  encoding='utf-8')

    def execute(self, context):
        """
          Read src file into data frame
          Return keys as an array of tuples
          Delete records by key from db
          Filter out closed records from dataframe
        """
        logging.info('Executing: Reading ' + self.src_file + ' into dataframe')
        self.df = self.read_into_df(self.src_file)
        logging.info('Executing: Create tuples for keys ')
        keys = self.keys_as_tuples(self.df)
        logging.info('Executing: Deleting records from table ' + self.db_table)
        self.delete_db_records(keys)
        logging.info('Executing: Filter out closed records')
        self.df = self.df.loc[self.df[self.src_file_attr["action"]] != 'C']
        # func: write dataframe into tab-delimited .csv file
        self.write_to_file(self.df)
        self.conn.bulk_load(self.db_table, self.output_filename)
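A hypothetical instantiation of LoadOltpOperator in a DAG is shown below; the file paths and column names mirror the constructor defaults, while the task id, connection id and target table are placeholder values and a `dag` object is assumed to exist.

load_departments = LoadOltpOperator(
    task_id='load_departments_oltp',
    src_file='/usr/local/airflow/csv_lz_2/cleaned_depts.csv',
    src_file_attr={'action': 'action', 'key': 'department_id'},
    tgt_table_conn_id='mysql_default',
    db_table='departments',        # placeholder target table
    output_filename='/usr/local/airflow/csv_lz_2/depts_td.csv',
    dag=dag,
)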