# Example 1
from datetime import datetime, timedelta
import json
import time

import requests

from airflow import DAG
from airflow.operators.email import EmailOperator
from airflow.providers.apache.sqoop.operators.sqoop import SqoopOperator

#Define default arguments for dag process
# Default arguments applied to every task in the DAG.
# NOTE: the Airflow key is "retries" (plural) — the original "retry" key was
# silently ignored, so failed tasks were never retried and "retry_delay"
# never took effect.
default_args = {
    "owner": "airflow",
    "email_on_failure": True,
    "email_on_retry": True,
    "email": "*****@*****.**",
    "retries": 1,                       # retry each failed task once
    "retry_delay": timedelta(minutes=5) # wait 5 minutes between attempts
}

# DAG "twitter": runs every 10 minutes and exports the Hive/HCatalog
# "twitter" table into the relational "twitter" table via `sqoop export`.
with DAG(
    "twitter",
    schedule_interval="*/10 * * * *",   # cron: every 10 minutes
    start_date=datetime(2021, 1, 1),
    catchup=False,                      # do not backfill missed runs
    default_args=default_args,
) as dag:

    # Sqoop export task: HCatalog table -> RDBMS table.
    hive_to_mysql = SqoopOperator(
        task_id="hive_to_mysql",
        conn_id="sqoop_conn",
        cmd_type="export",
        table="twitter",
        hcatalog_table="twitter",
    )
# Example 2
    def test_execute(self):
        """
        Tests to verify values of the SqoopOperator match that passed in from the config.
        """
        operator = SqoopOperator(task_id='sqoop_job',
                                 dag=self.dag,
                                 **self._config)

        # Every key passed through self._config must surface unchanged as the
        # operator attribute of the same name; iterate instead of spelling out
        # one assertEqual per attribute.
        passthrough_attrs = (
            'conn_id',
            'query',
            'cmd_type',
            'table',
            'target_dir',
            'append',
            'file_type',
            'num_mappers',
            'split_by',
            'input_null_string',
            'input_null_non_string',
            'staging_table',
            'clear_staging_table',
            'batch',
            'relaxed_isolation',
            'direct',
            'driver',
            'properties',
            'hcatalog_database',
            'hcatalog_table',
            'create_hcatalog_table',
            'extra_import_options',
            'extra_export_options',
        )
        for attr in passthrough_attrs:
            self.assertEqual(self._config[attr], getattr(operator, attr))

        # The constructions below are meant more as usage examples; they only
        # check that each parameter combination instantiates cleanly.

        # Import a whole table into a new ORC-backed HCatalog table.
        SqoopOperator(
            task_id='sqoop_import_using_table',
            cmd_type='import',
            conn_id='sqoop_default',
            table='company',
            verbose=True,
            num_mappers=8,
            hcatalog_database='default',
            hcatalog_table='import_table_1',
            create_hcatalog_table=True,
            extra_import_options={
                'hcatalog-storage-stanza': "\"stored as orcfile\""
            },
            dag=self.dag,
        )

        # Import via a free-form query; the mappers fill in $CONDITIONS based
        # on the split-by column.
        SqoopOperator(
            task_id='sqoop_import_using_query',
            cmd_type='import',
            conn_id='sqoop_default',
            query='select name, age from company where $CONDITIONS',
            split_by='age',
            verbose=True,
            num_mappers=None,
            hcatalog_database='default',
            hcatalog_table='import_table_2',
            create_hcatalog_table=True,
            extra_import_options={
                'hcatalog-storage-stanza': "\"stored as orcfile\""
            },
            dag=self.dag,
        )

        # Import into a specific Hive partition with a bounded fetch size.
        SqoopOperator(
            task_id='sqoop_import_with_partition',
            cmd_type='import',
            conn_id='sqoop_default',
            table='company',
            verbose=True,
            num_mappers=None,
            hcatalog_database='default',
            hcatalog_table='import_table_3',
            create_hcatalog_table=True,
            extra_import_options={
                'hcatalog-storage-stanza': "\"stored as orcfile\"",
                'hive-partition-key': 'day',
                'hive-partition-value': '2017-10-18',
                'fetch-size': 1,
            },
            dag=self.dag,
        )

        # Export referencing the source by HCatalog table name.
        SqoopOperator(
            task_id='sqoop_export_tablename',
            cmd_type='export',
            conn_id='sqoop_default',
            table='rbdms_export_table_1',
            verbose=True,
            num_mappers=None,
            hcatalog_database='default',
            hcatalog_table='hive_export_table_1',
            extra_export_options=None,
            dag=self.dag,
        )

        # Export referencing the source by warehouse path, using direct mode
        # for faster transfer.
        SqoopOperator(
            task_id='sqoop_export_tablepath',
            cmd_type='export',
            conn_id='sqoop_default',
            table='rbdms_export_table_2',
            export_dir='/user/hive/warehouse/export_table_2',
            direct=True,
            verbose=True,
            num_mappers=None,
            extra_export_options=None,
            dag=self.dag,
        )