def execute(self, context):
    # use the super to list all files in an Azure Data Lake path
    files = super(AdlsToGoogleCloudStorageOperator, self).execute(context)
    g_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)

    if not self.replace:
        # if we are not replacing -> list all files in the ADLS path
        # and only keep those files which are present in
        # ADLS and not in Google Cloud Storage
        bucket_name, prefix = _parse_gcs_url(self.dest_gcs)
        existing_files = g_hook.list(bucket=bucket_name, prefix=prefix)
        files = set(files) - set(existing_files)

    if files:
        hook = AzureDataLakeHook(
            azure_data_lake_conn_id=self.azure_data_lake_conn_id
        )

        for obj in files:
            with NamedTemporaryFile(mode='wb', delete=True) as f:
                hook.download_file(local_path=f.name, remote_path=obj)
                f.flush()
                dest_gcs_bucket, dest_gcs_prefix = _parse_gcs_url(self.dest_gcs)
                dest_path = os.path.join(dest_gcs_prefix, obj)
                self.log.info("Saving file to %s", dest_path)

                g_hook.upload(bucket=dest_gcs_bucket,
                              object=dest_path,
                              filename=f.name)

        self.log.info("All done, uploaded %d files to GCS", len(files))
    else:
        self.log.info("In sync, no files needed to be uploaded to GCS")

    return files
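# A minimal usage sketch for the ADLS-to-GCS transfer above. It assumes the
# operator's constructor accepts keyword arguments matching the attributes
# execute() reads (a source ADLS path for the listing, dest_gcs, replace and
# the two connection ids); task_id follows the usual Airflow pattern. Treat
# the exact parameter names as assumptions, not a confirmed API.
copy_adls_to_gcs = AdlsToGoogleCloudStorageOperator(
    task_id='copy_adls_to_gcs',
    src_adls='folder/output/*.parquet',          # assumed name of the source path argument
    dest_gcs='gs://my-bucket/adls-sync/',        # parsed by _parse_gcs_url in execute()
    replace=False,                               # only upload files missing from GCS
    azure_data_lake_conn_id='azure_data_lake_default',
    google_cloud_storage_conn_id='google_cloud_default',
)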
def execute(self, context):
    hook = AzureDataLakeHook(
        azure_data_lake_conn_id=self.azure_data_lake_conn_id)
    self.log.info('Getting list of ADLS files in path: %s', self.path)
    return hook.list(path=self.path)
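# A short usage sketch for the listing execute() above, assuming it belongs to
# an ADLS "list" operator (AzureDataLakeStorageListOperator is the assumed
# class name) whose constructor takes the path attribute read in execute().
# The path may be a glob, which hook.list() resolves via the ADLS client.
list_adls_files = AzureDataLakeStorageListOperator(
    task_id='list_adls_files',
    path='folder/output/*.parquet',              # glob or directory path to list
    azure_data_lake_conn_id='azure_data_lake_default',
)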
def poke(self, context):
    hook = AzureDataLakeHook(
        azure_data_lake_conn_id=self.azure_data_lake_conn_id)
    adls_conn = hook.get_conn()
    self.log.info('Poking for glob path: %s in ADLS://%s',
                  self.glob_path, adls_conn.kwargs['store_name'])
    return hook.check_for_file(self.glob_path)
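# A usage sketch for the poke() method above, assuming it lives on a sensor
# class (AzureDataLakeGlobSensor is a hypothetical name) that stores glob_path
# and azure_data_lake_conn_id on the instance. Like any Airflow sensor, poke()
# is re-invoked on the poke_interval until it returns True.
wait_for_adls_file = AzureDataLakeGlobSensor(
    task_id='wait_for_adls_file',
    glob_path='incoming/2019-*/data_*.csv',      # pattern passed to check_for_file()
    azure_data_lake_conn_id='azure_data_lake_default',
    poke_interval=60,
)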
def execute(self, context):
    hook = AzureDataLakeHook(
        azure_data_lake_conn_id=self.azure_data_lake_conn_id
    )
    self.log.info('Getting list of ADLS files in path: %s', self.path)
    return hook.list(path=self.path)
def test_download_file(self, mock_lib, mock_downloader):
    from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
    hook = AzureDataLakeHook(azure_data_lake_conn_id='adl_test_key')
    hook.download_file(local_path='test_adl_hook.py',
                       remote_path='/test_adl_hook.py',
                       nthreads=64, overwrite=True,
                       buffersize=4194304, blocksize=4194304)
    mock_downloader.assert_called_once_with(hook.connection,
                                            lpath='test_adl_hook.py',
                                            rpath='/test_adl_hook.py',
                                            nthreads=64, overwrite=True,
                                            buffersize=4194304,
                                            blocksize=4194304)
def test_conn(self, mock_lib):
    from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
    from azure.datalake.store import core
    hook = AzureDataLakeHook(azure_data_lake_conn_id='adl_test_key')
    self.assertEqual(hook.conn_id, 'adl_test_key')
    self.assertIsInstance(hook.connection, core.AzureDLFileSystem)
    assert mock_lib.auth.called
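# The mock_lib / mock_downloader / mock_fs / mock_filesystem parameters in the
# tests above and below come from mock.patch decorators. A sketch of the
# assumed patch targets (the dotted paths mirror how azure.datalake.store is
# imported inside the hook module; adjust to the actual import layout):
import unittest
from unittest import mock


class TestAzureDataLakeHook(unittest.TestCase):

    @mock.patch('airflow.contrib.hooks.azure_data_lake_hook.core.AzureDLFileSystem',
                autospec=True)
    @mock.patch('airflow.contrib.hooks.azure_data_lake_hook.lib', autospec=True)
    def test_list_glob(self, mock_lib, mock_fs):
        # decorators apply bottom-up, so the innermost patch (lib) becomes the
        # first mock argument and the filesystem patch becomes the second
        ...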
def execute(self, context):
    oracle_hook = OracleHook(oracle_conn_id=self.oracle_conn_id)
    azure_data_lake_hook = AzureDataLakeHook(
        azure_data_lake_conn_id=self.azure_data_lake_conn_id)

    self.log.info("Dumping Oracle query results to local file")
    conn = oracle_hook.get_conn()
    cursor = conn.cursor()
    cursor.execute(self.sql, self.sql_params)

    with TemporaryDirectory(prefix='airflow_oracle_to_azure_op_') as temp:
        self._write_temp_file(cursor, os.path.join(temp, self.filename))
        self.log.info("Uploading local file to Azure Data Lake")
        azure_data_lake_hook.upload_file(
            os.path.join(temp, self.filename),
            os.path.join(self.azure_data_lake_path, self.filename))
    cursor.close()
    conn.close()
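# A usage sketch for the Oracle-to-ADLS execute() above, assuming the operator
# (OracleToAzureDataLakeTransfer is the assumed class name) takes keyword
# arguments matching the attributes it reads: sql, sql_params, filename,
# azure_data_lake_path and the two connection ids.
oracle_to_adls = OracleToAzureDataLakeTransfer(
    task_id='oracle_to_adls',
    sql='SELECT * FROM sales WHERE load_date = :load_date',
    sql_params={'load_date': '2019-01-01'},
    filename='sales_2019-01-01.csv',             # local temp file and remote file name
    azure_data_lake_path='raw/sales',            # remote directory the filename is joined onto
    oracle_conn_id='oracle_default',
    azure_data_lake_conn_id='azure_data_lake_default',
)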
def execute(self, context):
    source_hook = WasbHook(wasb_conn_id=self.azure_blob_conn_id)

    # Assumption: there is sufficient disk space to download the blob in question
    with NamedTemporaryFile(mode='wb', delete=True) as f:
        source_hook.get_file(file_path=f.name,
                             container_name=self.src_blob_container,
                             blob_name=self.src_blob)
        f.flush()
        self.log.info("Saving file to %s", f.name)

        if self.adls_gen == 1:
            self.log.info("Uploading to ADLS Gen 1")
            adls_hook = AzureDataLakeHook(
                azure_data_lake_conn_id=self.azure_data_lake_conn_id)
            adls_hook.upload_file(local_path=f.name, remote_path=f.name)
        else:
            self.log.info("Uploading to ADLS Gen 2")
            adls_hook = WasbHook(wasb_conn_id=self.azure_data_lake_conn_id)
            adls_hook.load_file(f.name,
                                container_name=self.dest_adls_container,
                                blob_name=self.dest_adls)

    self.log.info("All done, uploaded files to Azure Data Lake Store")
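# A usage sketch for the blob-to-ADLS execute() above. The operator class name
# (AzureBlobToADLSOperator) is hypothetical; the keyword arguments mirror the
# attributes read in execute(). With adls_gen=2 the "ADLS" connection is really
# a wasb/Blob-style connection, since the Gen 2 branch reuses WasbHook.
copy_blob_to_adls = AzureBlobToADLSOperator(
    task_id='copy_blob_to_adls',
    azure_blob_conn_id='wasb_default',
    src_blob_container='landing',
    src_blob='exports/data.csv',
    adls_gen=2,                                  # 1 -> AzureDataLakeHook, 2 -> WasbHook
    azure_data_lake_conn_id='adls_gen2_conn',
    dest_adls_container='curated',
    dest_adls='exports/data.csv',
)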
def get_hook(self):
    if self.conn_type == 'mysql':
        from airflow.hooks.mysql_hook import MySqlHook
        return MySqlHook(mysql_conn_id=self.conn_id)
    elif self.conn_type == 'google_cloud_platform':
        from airflow.gcp.hooks.bigquery import BigQueryHook
        return BigQueryHook(bigquery_conn_id=self.conn_id)
    elif self.conn_type == 'postgres':
        from airflow.hooks.postgres_hook import PostgresHook
        return PostgresHook(postgres_conn_id=self.conn_id)
    elif self.conn_type == 'pig_cli':
        from airflow.hooks.pig_hook import PigCliHook
        return PigCliHook(pig_cli_conn_id=self.conn_id)
    elif self.conn_type == 'hive_cli':
        from airflow.hooks.hive_hooks import HiveCliHook
        return HiveCliHook(hive_cli_conn_id=self.conn_id)
    elif self.conn_type == 'presto':
        from airflow.hooks.presto_hook import PrestoHook
        return PrestoHook(presto_conn_id=self.conn_id)
    elif self.conn_type == 'hiveserver2':
        from airflow.hooks.hive_hooks import HiveServer2Hook
        return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
    elif self.conn_type == 'sqlite':
        from airflow.hooks.sqlite_hook import SqliteHook
        return SqliteHook(sqlite_conn_id=self.conn_id)
    elif self.conn_type == 'jdbc':
        from airflow.hooks.jdbc_hook import JdbcHook
        return JdbcHook(jdbc_conn_id=self.conn_id)
    elif self.conn_type == 'mssql':
        from airflow.hooks.mssql_hook import MsSqlHook
        return MsSqlHook(mssql_conn_id=self.conn_id)
    elif self.conn_type == 'oracle':
        from airflow.hooks.oracle_hook import OracleHook
        return OracleHook(oracle_conn_id=self.conn_id)
    elif self.conn_type == 'vertica':
        from airflow.contrib.hooks.vertica_hook import VerticaHook
        return VerticaHook(vertica_conn_id=self.conn_id)
    elif self.conn_type == 'cloudant':
        from airflow.contrib.hooks.cloudant_hook import CloudantHook
        return CloudantHook(cloudant_conn_id=self.conn_id)
    elif self.conn_type == 'jira':
        from airflow.contrib.hooks.jira_hook import JiraHook
        return JiraHook(jira_conn_id=self.conn_id)
    elif self.conn_type == 'redis':
        from airflow.contrib.hooks.redis_hook import RedisHook
        return RedisHook(redis_conn_id=self.conn_id)
    elif self.conn_type == 'wasb':
        from airflow.contrib.hooks.wasb_hook import WasbHook
        return WasbHook(wasb_conn_id=self.conn_id)
    elif self.conn_type == 'docker':
        from airflow.hooks.docker_hook import DockerHook
        return DockerHook(docker_conn_id=self.conn_id)
    elif self.conn_type == 'azure_data_lake':
        from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
        return AzureDataLakeHook(azure_data_lake_conn_id=self.conn_id)
    elif self.conn_type == 'azure_cosmos':
        from airflow.contrib.hooks.azure_cosmos_hook import AzureCosmosDBHook
        return AzureCosmosDBHook(azure_cosmos_conn_id=self.conn_id)
    elif self.conn_type == 'cassandra':
        from airflow.contrib.hooks.cassandra_hook import CassandraHook
        return CassandraHook(cassandra_conn_id=self.conn_id)
    elif self.conn_type == 'mongo':
        from airflow.contrib.hooks.mongo_hook import MongoHook
        return MongoHook(conn_id=self.conn_id)
    elif self.conn_type == 'gcpcloudsql':
        from airflow.gcp.hooks.cloud_sql import CloudSqlDatabaseHook
        return CloudSqlDatabaseHook(gcp_cloudsql_conn_id=self.conn_id)
    elif self.conn_type == 'grpc':
        from airflow.contrib.hooks.grpc_hook import GrpcHook
        return GrpcHook(grpc_conn_id=self.conn_id)
    raise AirflowException("Unknown hook type {}".format(self.conn_type))
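# get_hook() is typically reached through a stored Airflow connection: look the
# connection up, then let conn_type pick the hook class. A minimal sketch,
# assuming a connection named 'azure_data_lake_default' with conn_type
# 'azure_data_lake' exists in the metadata database:
from airflow.hooks.base_hook import BaseHook

conn = BaseHook.get_connection('azure_data_lake_default')
hook = conn.get_hook()    # -> AzureDataLakeHook(azure_data_lake_conn_id=conn.conn_id)
files = hook.list('folder/output/*.parquet')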
def test_check_for_blob(self, mock_lib, mock_filesystem):
    from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
    hook = AzureDataLakeHook(azure_data_lake_conn_id='adl_test_key')
    hook.check_for_file('file_path')
    # the bare expression in the original asserted nothing; check the call on
    # the mocked filesystem instance, as the list tests below do
    assert mock_filesystem.return_value.glob.called
def test_list_walk(self, mock_lib, mock_fs):
    from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
    hook = AzureDataLakeHook(azure_data_lake_conn_id='adl_test_key')
    hook.list('file_path/some_folder/')
    mock_fs.return_value.walk.assert_called_with('file_path/some_folder/')
def test_list_glob(self, mock_lib, mock_fs):
    from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
    hook = AzureDataLakeHook(azure_data_lake_conn_id='adl_test_key')
    hook.list('file_path/*')
    mock_fs.return_value.glob.assert_called_with('file_path/*')
def test_list_glob(self, mock_lib, mock_fs):
    from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
    hook = AzureDataLakeHook(azure_data_lake_conn_id='adl_test_key')
    hook.list('file_path/*')
    mock_fs.return_value.glob.assert_called_once_with('file_path/*')