Example 1
    def execute(self, context):
        # use the superclass execute() to list all files in the Azure Data Lake path
        files = super(AdlsToGoogleCloudStorageOperator, self).execute(context)
        g_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)

        if not self.replace:
            # if we are not replacing -> list all files already in the
            # GCS bucket and only keep those files which are present in
            # ADLS and not in Google Cloud Storage
            bucket_name, prefix = _parse_gcs_url(self.dest_gcs)
            existing_files = g_hook.list(bucket=bucket_name, prefix=prefix)
            files = set(files) - set(existing_files)

        if files:
            hook = AzureDataLakeHook(
                azure_data_lake_conn_id=self.azure_data_lake_conn_id
            )

            for obj in files:
                with NamedTemporaryFile(mode='wb', delete=True) as f:
                    hook.download_file(local_path=f.name, remote_path=obj)
                    f.flush()
                    dest_gcs_bucket, dest_gcs_prefix = _parse_gcs_url(self.dest_gcs)
                    dest_path = os.path.join(dest_gcs_prefix, obj)
                    self.log.info("Saving file to %s", dest_path)

                    g_hook.upload(bucket=dest_gcs_bucket, object=dest_path, filename=f.name)

            self.log.info("All done, uploaded %d files to GCS", len(files))
        else:
            self.log.info("In sync, no files needed to be uploaded to GCS")

        return files
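A minimal DAG sketch of how this operator might be wired up. The module path and the src_adls/dest_gcs constructor arguments are not shown in the snippet above; they are assumed from the contrib adls_to_gcs module this execute() appears to come from.

from datetime import datetime

from airflow import DAG
from airflow.contrib.operators.adls_to_gcs import AdlsToGoogleCloudStorageOperator

with DAG(dag_id='adls_to_gcs_example',
         start_date=datetime(2020, 1, 1),
         schedule_interval=None) as dag:
    copy_files = AdlsToGoogleCloudStorageOperator(
        task_id='copy_adls_to_gcs',
        src_adls='landing/2020/*',          # ADLS path/glob, listed by the superclass
        dest_gcs='gs://my-bucket/landing/', # destination bucket + prefix
        replace=False,                      # skip files already present in GCS
        azure_data_lake_conn_id='azure_data_lake_default',
        google_cloud_storage_conn_id='google_cloud_default',
    )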
Example 2
    def execute(self, context):

        hook = AzureDataLakeHook(
            azure_data_lake_conn_id=self.azure_data_lake_conn_id)

        self.log.info('Getting list of ADLS files in path: %s', self.path)

        return hook.list(path=self.path)
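Taken together with the tests in Example 12, list() dispatches on the path: a pattern containing '*' goes to glob(), a plain directory to a recursive walk(). A short direct-usage sketch (connection id and paths illustrative):

from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook

hook = AzureDataLakeHook(azure_data_lake_conn_id='azure_data_lake_default')

# A concrete directory is walked recursively ...
all_files = hook.list('raw/events/')
# ... while a path containing '*' is resolved as a glob pattern
json_files = hook.list('raw/events/*.json')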
Example 3
    def poke(self, context):
        hook = AzureDataLakeHook(
            azure_data_lake_conn_id=self.azure_data_lake_conn_id)
        adls_conn = hook.get_conn()
        self.log.info('Poking for glob path: %s in ADLS://%s', self.glob_path,
                      adls_conn.kwargs['store_name'])

        return hook.check_for_file(self.glob_path)
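poke() follows the standard sensor contract: return True to succeed, False to try again after the poke interval. A minimal self-contained sensor built the same way might look like this (the class name and the Airflow 1.10 import paths are assumptions, not taken from the snippet):

from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
from airflow.sensors.base_sensor_operator import BaseSensorOperator
from airflow.utils.decorators import apply_defaults


class AdlsGlobSensor(BaseSensorOperator):
    """Waits until a glob pattern matches at least one file in ADLS."""

    @apply_defaults
    def __init__(self, glob_path,
                 azure_data_lake_conn_id='azure_data_lake_default',
                 *args, **kwargs):
        super(AdlsGlobSensor, self).__init__(*args, **kwargs)
        self.glob_path = glob_path
        self.azure_data_lake_conn_id = azure_data_lake_conn_id

    def poke(self, context):
        hook = AzureDataLakeHook(
            azure_data_lake_conn_id=self.azure_data_lake_conn_id)
        return hook.check_for_file(self.glob_path)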
Example 5
 def test_download_file(self, mock_lib, mock_downloader):
     from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
     hook = AzureDataLakeHook(azure_data_lake_conn_id='adl_test_key')
     hook.download_file(local_path='test_adl_hook.py',
                        remote_path='/test_adl_hook.py',
                        nthreads=64, overwrite=True,
                        buffersize=4194304, blocksize=4194304)
     mock_downloader.assert_called_once_with(hook.connection,
                                             lpath='test_adl_hook.py',
                                             rpath='/test_adl_hook.py',
                                             nthreads=64, overwrite=True,
                                             buffersize=4194304, blocksize=4194304)
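The mock_lib and mock_downloader arguments are injected by @mock.patch decorators that this listing omits. The keyword arguments mirror what the hook forwards to azure-datalake-store's downloader, so a direct call looks the same (paths and connection id illustrative):

from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook

hook = AzureDataLakeHook(azure_data_lake_conn_id='azure_data_lake_default')
hook.download_file(
    local_path='/tmp/report.csv',    # where to write on the worker
    remote_path='/data/report.csv',  # source path in the ADLS store
    nthreads=64,                     # parallel download threads
    overwrite=True,                  # replace an existing local file
    buffersize=4194304,              # 4 MiB internal buffer
    blocksize=4194304)               # 4 MiB per request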
Example 6
 def test_conn(self, mock_lib):
     from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
     from azure.datalake.store import core
     hook = AzureDataLakeHook(azure_data_lake_conn_id='adl_test_key')
     self.assertEqual(hook.conn_id, 'adl_test_key')
     self.assertIsInstance(hook.connection, core.AzureDLFileSystem)
     assert mock_lib.auth.called
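The hook resolves 'adl_test_key' to an Airflow connection and builds the AzureDLFileSystem from it. Based on the contrib hook's get_conn(), the connection carries service-principal credentials roughly as sketched below; the exact field mapping is an assumption, not shown in the test:

import json

from airflow.models import Connection

conn = Connection(
    conn_id='adl_test_key',
    conn_type='azure_data_lake',
    login='<client_id>',         # Azure AD application (client) id
    password='<client_secret>',  # service-principal secret
    extra=json.dumps({'tenant': '<tenant_id>',
                      'account_name': '<adls_store_name>'}),
)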
Example 7
    def execute(self, context):
        oracle_hook = OracleHook(oracle_conn_id=self.oracle_conn_id)
        azure_data_lake_hook = AzureDataLakeHook(
            azure_data_lake_conn_id=self.azure_data_lake_conn_id)

        self.log.info("Dumping Oracle query results to local file")
        conn = oracle_hook.get_conn()
        cursor = conn.cursor()
        cursor.execute(self.sql, self.sql_params)

        with TemporaryDirectory(prefix='airflow_oracle_to_azure_op_') as temp:
            self._write_temp_file(cursor, os.path.join(temp, self.filename))
            self.log.info("Uploading local file to Azure Data Lake")
            azure_data_lake_hook.upload_file(
                os.path.join(temp, self.filename),
                os.path.join(self.azure_data_lake_path, self.filename))
        cursor.close()
        conn.close()
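Assuming this execute() belongs to contrib's OracleToAzureDataLakeTransfer (the class name is not shown above), a task definition would look roughly like this; the bind-variable style follows cx_Oracle's named parameters:

from airflow.contrib.operators.oracle_to_azure_data_lake_transfer import (
    OracleToAzureDataLakeTransfer)

export_orders = OracleToAzureDataLakeTransfer(
    task_id='export_orders',
    sql='SELECT * FROM orders WHERE order_date >= :start_date',
    sql_params={'start_date': '2020-01-01'},
    oracle_conn_id='oracle_default',
    filename='orders.csv',              # written to a temp dir, then uploaded
    azure_data_lake_conn_id='azure_data_lake_default',
    azure_data_lake_path='/landing/orders',
)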
Example 9
    def execute(self, context):
        source_hook = WasbHook(wasb_conn_id=self.azure_blob_conn_id)

        # Assumption: there is sufficient disk space to download the blob in question
        with NamedTemporaryFile(mode='wb', delete=True) as f:
            source_hook.get_file(file_path=f.name,
                                 container_name=self.src_blob_container,
                                 blob_name=self.src_blob)
            f.flush()
            self.log.info("Saving file to %s", f.name)

            if self.adls_gen == 1:
                self.log.info("Uploading to ADLS Gen 1")
                adls_hook = AzureDataLakeHook(
                    azure_data_lake_conn_id=self.azure_data_lake_conn_id)
                # upload under the destination path, not the temp file name
                adls_hook.upload_file(local_path=f.name,
                                      remote_path=self.dest_adls)
            else:
                self.log.info("Uploading to ADLS Gen 2")
                adls_hook = WasbHook(wasb_conn_id=self.azure_data_lake_conn_id)
                adls_hook.load_file(f.name,
                                    container_name=self.dest_adls_container,
                                    blob_name=self.dest_adls)

        self.log.info("All done, uploaded files to Azure Data Lake Store")
Example 10
 def get_hook(self):
     if self.conn_type == 'mysql':
         from airflow.hooks.mysql_hook import MySqlHook
         return MySqlHook(mysql_conn_id=self.conn_id)
     elif self.conn_type == 'google_cloud_platform':
         from airflow.gcp.hooks.bigquery import BigQueryHook
         return BigQueryHook(bigquery_conn_id=self.conn_id)
     elif self.conn_type == 'postgres':
         from airflow.hooks.postgres_hook import PostgresHook
         return PostgresHook(postgres_conn_id=self.conn_id)
     elif self.conn_type == 'pig_cli':
         from airflow.hooks.pig_hook import PigCliHook
         return PigCliHook(pig_cli_conn_id=self.conn_id)
     elif self.conn_type == 'hive_cli':
         from airflow.hooks.hive_hooks import HiveCliHook
         return HiveCliHook(hive_cli_conn_id=self.conn_id)
     elif self.conn_type == 'presto':
         from airflow.hooks.presto_hook import PrestoHook
         return PrestoHook(presto_conn_id=self.conn_id)
     elif self.conn_type == 'hiveserver2':
         from airflow.hooks.hive_hooks import HiveServer2Hook
         return HiveServer2Hook(hiveserver2_conn_id=self.conn_id)
     elif self.conn_type == 'sqlite':
         from airflow.hooks.sqlite_hook import SqliteHook
         return SqliteHook(sqlite_conn_id=self.conn_id)
     elif self.conn_type == 'jdbc':
         from airflow.hooks.jdbc_hook import JdbcHook
         return JdbcHook(jdbc_conn_id=self.conn_id)
     elif self.conn_type == 'mssql':
         from airflow.hooks.mssql_hook import MsSqlHook
         return MsSqlHook(mssql_conn_id=self.conn_id)
     elif self.conn_type == 'oracle':
         from airflow.hooks.oracle_hook import OracleHook
         return OracleHook(oracle_conn_id=self.conn_id)
     elif self.conn_type == 'vertica':
         from airflow.contrib.hooks.vertica_hook import VerticaHook
         return VerticaHook(vertica_conn_id=self.conn_id)
     elif self.conn_type == 'cloudant':
         from airflow.contrib.hooks.cloudant_hook import CloudantHook
         return CloudantHook(cloudant_conn_id=self.conn_id)
     elif self.conn_type == 'jira':
         from airflow.contrib.hooks.jira_hook import JiraHook
         return JiraHook(jira_conn_id=self.conn_id)
     elif self.conn_type == 'redis':
         from airflow.contrib.hooks.redis_hook import RedisHook
         return RedisHook(redis_conn_id=self.conn_id)
     elif self.conn_type == 'wasb':
         from airflow.contrib.hooks.wasb_hook import WasbHook
         return WasbHook(wasb_conn_id=self.conn_id)
     elif self.conn_type == 'docker':
         from airflow.hooks.docker_hook import DockerHook
         return DockerHook(docker_conn_id=self.conn_id)
     elif self.conn_type == 'azure_data_lake':
         from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
         return AzureDataLakeHook(azure_data_lake_conn_id=self.conn_id)
     elif self.conn_type == 'azure_cosmos':
         from airflow.contrib.hooks.azure_cosmos_hook import AzureCosmosDBHook
         return AzureCosmosDBHook(azure_cosmos_conn_id=self.conn_id)
     elif self.conn_type == 'cassandra':
         from airflow.contrib.hooks.cassandra_hook import CassandraHook
         return CassandraHook(cassandra_conn_id=self.conn_id)
     elif self.conn_type == 'mongo':
         from airflow.contrib.hooks.mongo_hook import MongoHook
         return MongoHook(conn_id=self.conn_id)
     elif self.conn_type == 'gcpcloudsql':
         from airflow.gcp.hooks.cloud_sql import CloudSqlDatabaseHook
         return CloudSqlDatabaseHook(gcp_cloudsql_conn_id=self.conn_id)
     elif self.conn_type == 'grpc':
         from airflow.contrib.hooks.grpc_hook import GrpcHook
         return GrpcHook(grpc_conn_id=self.conn_id)
     raise AirflowException("Unknown hook type {}".format(self.conn_type))
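Each branch defers its import so that only the driver for the requested conn_type is loaded. Reaching the ADLS branch directly is straightforward:

from airflow.models import Connection

conn = Connection(conn_id='adl_test_key', conn_type='azure_data_lake')
hook = conn.get_hook()  # AzureDataLakeHook(azure_data_lake_conn_id='adl_test_key')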
Example 11
 def test_check_for_blob(self, mock_lib, mock_filesystem):
     from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
     hook = AzureDataLakeHook(azure_data_lake_conn_id='adl_test_key')
     hook.check_for_file('file_path')
     assert mock_filesystem.glob.called
Example 12
 def test_list_walk(self, mock_lib, mock_fs):
     from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
     hook = AzureDataLakeHook(azure_data_lake_conn_id='adl_test_key')
     hook.list('file_path/some_folder/')
     mock_fs.return_value.walk.assert_called_with('file_path/some_folder/')
 def test_list_glob(self, mock_lib, mock_fs):
     from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook
     hook = AzureDataLakeHook(azure_data_lake_conn_id='adl_test_key')
     hook.list('file_path/*')
     mock_fs.return_value.glob.assert_called_once_with('file_path/*')