Beispiel #1
0
    def execute(self, context: "Context") -> None:
        """Run ``self.sql`` against Oracle and upload the dumped result to Azure Data Lake.

        The query is executed with ``self.sql_params``; the open cursor is handed
        to ``self._write_temp_file`` together with a path inside a temporary
        directory, and that file is then uploaded to
        ``self.azure_data_lake_path`` under the name ``self.filename``.

        The cursor and the connection are always closed, even when the query,
        the local dump or the upload raises.

        :param context: Airflow task context (not used by this method).
        """
        oracle_hook = OracleHook(oracle_conn_id=self.oracle_conn_id)
        azure_data_lake_hook = AzureDataLakeHook(azure_data_lake_conn_id=self.azure_data_lake_conn_id)

        self.log.info("Dumping Oracle query results to local file")
        conn = oracle_hook.get_conn()
        try:
            cursor = conn.cursor()  # type: ignore[attr-defined]
            try:
                cursor.execute(self.sql, self.sql_params)

                # The temporary directory (and the dumped file in it) is removed
                # as soon as the upload has finished or failed.
                with TemporaryDirectory(prefix='airflow_oracle_to_azure_op_') as temp:
                    local_path = os.path.join(temp, self.filename)
                    self._write_temp_file(cursor, local_path)
                    self.log.info("Uploading local file to Azure Data Lake")
                    azure_data_lake_hook.upload_file(
                        local_path, os.path.join(self.azure_data_lake_path, self.filename)
                    )
            finally:
                # Release the cursor before the connection, on success or failure.
                cursor.close()
        finally:
            conn.close()  # type: ignore[attr-defined]
Beispiel #2
0
 def execute(self, context: "Context") -> None:
     """Upload ``self.local_path`` to ``self.remote_path`` in Azure Data Lake.

     Any entries in ``self.extra_upload_options`` are forwarded to the hook's
     ``upload_file`` call as additional keyword arguments.

     :param context: Airflow task context (not used by this method).
     :raises AirflowException: if ``self.local_path`` contains ``**``.
     """
     recursive_glob = '**' in self.local_path
     if recursive_glob:
         raise AirflowException("Recursive glob patterns using `**` are not supported")

     # Normalise a missing/empty options value to a dict so it can be unpacked.
     if not self.extra_upload_options:
         self.extra_upload_options = {}

     adls = AzureDataLakeHook(azure_data_lake_conn_id=self.azure_data_lake_conn_id)
     self.log.info('Uploading %s to %s', self.local_path, self.remote_path)

     upload_kwargs = {
         "local_path": self.local_path,
         "remote_path": self.remote_path,
         "nthreads": self.nthreads,
         "overwrite": self.overwrite,
         "buffersize": self.buffersize,
         "blocksize": self.blocksize,
     }
     # Unpacking both mappings in one call still raises TypeError on a
     # duplicated keyword, exactly like explicit keywords would.
     return adls.upload_file(**upload_kwargs, **self.extra_upload_options)
Beispiel #3
0
    def execute(self, context: 'Context'):
        """Copy the files listed by the parent operator from ADLS to GCS.

        The parent ``execute`` supplies the candidate file list. When
        ``self.replace`` is false, names already returned by the GCS listing for
        the destination bucket/prefix are removed from that list first. Each
        remaining file is downloaded to a temporary file and uploaded to the
        destination parsed from ``self.dest_gcs``.

        :param context: Airflow task context, forwarded to the parent ``execute``.
        :return: the files that were uploaded (the filtered list).
        """
        # use the super to list all files in an Azure Data Lake path
        files = super().execute(context)
        g_hook = GCSHook(
            gcp_conn_id=self.gcp_conn_id,
            delegate_to=self.delegate_to,
            impersonation_chain=self.google_impersonation_chain,
        )

        if not self.replace:
            # if we are not replacing -> list all files in the ADLS path
            # and only keep those files which are present in
            # ADLS and not in Google Cloud Storage
            bucket_name, prefix = _parse_gcs_url(self.dest_gcs)
            existing_files = g_hook.list(bucket_name=bucket_name, prefix=prefix)
            # NOTE(review): uploads below are stored under `prefix + obj`, while
            # this compares raw ADLS names with the GCS listing — confirm both
            # sides use the same naming when a prefix is set.
            files = list(set(files) - set(existing_files))

        if not files:
            self.log.info("In sync, no files needed to be uploaded to GCS")
            return files

        hook = AzureDataLakeHook(azure_data_lake_conn_id=self.azure_data_lake_conn_id)
        # The destination does not change between files, so parse it once
        # instead of on every iteration.
        dest_gcs_bucket, dest_gcs_prefix = _parse_gcs_url(self.dest_gcs)

        for obj in files:
            # The temporary file is deleted when the `with` block exits.
            with NamedTemporaryFile(mode='wb', delete=True) as f:
                hook.download_file(local_path=f.name, remote_path=obj)
                f.flush()
                dest_path = os.path.join(dest_gcs_prefix, obj)
                self.log.info("Saving file to %s", dest_path)

                g_hook.upload(
                    bucket_name=dest_gcs_bucket,
                    object_name=dest_path,
                    filename=f.name,
                    gzip=self.gzip,
                )

        self.log.info("All done, uploaded %d files to GCS", len(files))
        return files
Beispiel #4
0
 def execute(self, context: dict) -> Any:
     """Remove ``self.path`` through the Azure Data Lake hook.

     ``self.recursive`` and ``self.ignore_not_found`` are passed straight to
     the hook's ``remove`` call.

     :param context: Airflow task context (not used by this method).
     :return: whatever the hook's ``remove`` call returns.
     """
     adls = AzureDataLakeHook(azure_data_lake_conn_id=self.azure_data_lake_conn_id)
     outcome = adls.remove(
         path=self.path,
         recursive=self.recursive,
         ignore_not_found=self.ignore_not_found,
     )
     return outcome
Beispiel #5
0
 def execute(self, context: dict) -> list:
     """List ``self.path`` through the Azure Data Lake hook.

     :param context: Airflow task context (not used by this method).
     :return: the result of the hook's ``list`` call for ``self.path``.
     """
     adls = AzureDataLakeHook(azure_data_lake_conn_id=self.azure_data_lake_conn_id)
     self.log.info('Getting list of ADLS files in path: %s', self.path)
     listing = adls.list(path=self.path)
     return listing