Beispiel #1
0
    def execute(self, context):
        """
        Copy files from an Azure Data Lake path to Google Cloud Storage.

        The parent operator's ``execute`` supplies the ADLS file names. When
        ``self.replace`` is false, a file is skipped if the object it would be
        uploaded to is already listed under the destination prefix.

        :param context: task context, passed through to the parent ``execute``
        :return: the ADLS file names that were uploaded; a ``set`` when
            ``self.replace`` is false, otherwise the value returned by the
            parent ``execute``
        """
        # use the super to list all files in an Azure Data Lake path
        files = super(AdlsToGoogleCloudStorageOperator, self).execute(context)
        g_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)

        if not self.replace:
            # if we are not replacing -> only keep those files which are
            # present in ADLS and not in Google Cloud Storage
            bucket_name, prefix = _parse_gcs_url(self.dest_gcs)
            # Treat a falsy listing (e.g. None) as "nothing exists yet".
            existing_files = set(
                g_hook.list(bucket=bucket_name, prefix=prefix) or ())
            # Listed object names carry the destination prefix, so compare
            # against the path each file would be uploaded to rather than
            # against the bare ADLS name.
            files = {
                obj for obj in files
                if os.path.join(prefix, obj) not in existing_files
            }

        if files:
            hook = AzureDataLakeHook(
                azure_data_lake_conn_id=self.azure_data_lake_conn_id
            )
            # The destination is the same for every file: parse it once.
            dest_gcs_bucket, dest_gcs_prefix = _parse_gcs_url(self.dest_gcs)

            for obj in files:
                with NamedTemporaryFile(mode='wb', delete=True) as f:
                    hook.download_file(local_path=f.name, remote_path=obj)
                    f.flush()
                    dest_path = os.path.join(dest_gcs_prefix, obj)
                    self.log.info("Saving file to %s", dest_path)

                    g_hook.upload(bucket=dest_gcs_bucket, object=dest_path,
                                  filename=f.name)

            self.log.info("All done, uploaded %d files to GCS", len(files))
        else:
            self.log.info("In sync, no files needed to be uploaded to GCS")

        return files
Beispiel #2
0
    def execute(self, context):
        """
        Create an empty BigQuery table.

        The schema comes from ``self.schema_fields``; when that is empty and
        ``self.gcs_schema_object`` is set, the schema is downloaded from that
        GCS object and decoded as UTF-8 JSON instead.

        :param context: task context (not used by this method)
        """
        hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                            delegate_to=self.delegate_to)

        if self.schema_fields or not self.gcs_schema_object:
            # Explicit schema given, or no GCS object to fall back on.
            table_schema = self.schema_fields
        else:
            bucket, blob = _parse_gcs_url(self.gcs_schema_object)
            storage = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to)
            raw_schema = storage.download(bucket, blob)
            table_schema = json.loads(raw_schema.decode("utf-8"))

        bq_cursor = hook.get_conn().cursor()
        bq_cursor.create_empty_table(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            table_id=self.table_id,
            schema_fields=table_schema,
            time_partitioning=self.time_partitioning
        )
    def execute(self, context):
        """
        Create an empty, optionally labelled, BigQuery table.

        ``self.schema_fields`` is used as the schema unless it is empty and
        ``self.gcs_schema_object`` is set, in which case the schema is read
        from that GCS object (UTF-8 encoded JSON).

        :param context: task context (not used by this method)
        """
        big_query = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id,
                                 delegate_to=self.delegate_to)

        table_schema = self.schema_fields
        if not table_schema and self.gcs_schema_object:
            schema_bucket, schema_object = _parse_gcs_url(
                self.gcs_schema_object)
            cloud_storage = GoogleCloudStorageHook(
                google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
                delegate_to=self.delegate_to)
            schema_bytes = cloud_storage.download(schema_bucket, schema_object)
            table_schema = json.loads(schema_bytes.decode("utf-8"))

        connection = big_query.get_conn()
        connection.cursor().create_empty_table(
            project_id=self.project_id,
            dataset_id=self.dataset_id,
            table_id=self.table_id,
            schema_fields=table_schema,
            time_partitioning=self.time_partitioning,
            labels=self.labels
        )
Beispiel #4
0
    def test_parse_gcs_url(self):
        """
        GCS urls are split into a (bucket, blob) pair; malformed urls raise.
        """
        parse = gcs_hook._parse_gcs_url

        bucket_and_blob = parse('gs://bucket/path/to/blob')
        self.assertEqual(bucket_and_blob, ('bucket', 'path/to/blob'))

        # a single slash after the scheme makes the URI invalid
        with self.assertRaises(AirflowException):
            parse('gs:/bucket/path/to/blob')

        # a trailing slash is kept on the blob
        with_trailing_slash = parse('gs://bucket/path/to/blob/')
        self.assertEqual(with_trailing_slash, ('bucket', 'path/to/blob/'))

        # a bucket-only url yields an empty blob
        bucket_only = parse('gs://bucket/')
        self.assertEqual(bucket_only, ('bucket', ''))
    def test_parse_gcs_url(self):
        """
        Exercise GCS url parsing with valid, invalid and edge-case urls.
        """
        parsed = gcs_hook._parse_gcs_url('gs://bucket/path/to/blob')
        self.assertEqual(parsed, ('bucket', 'path/to/blob'))

        # invalid URI: only one slash after the scheme
        self.assertRaises(
            AirflowException,
            gcs_hook._parse_gcs_url,
            'gs:/bucket/path/to/blob',
        )

        expectations = (
            # trailing slash
            ('gs://bucket/path/to/blob/', ('bucket', 'path/to/blob/')),
            # bucket only
            ('gs://bucket/', ('bucket', '')),
        )
        for url, expected in expectations:
            self.assertEqual(gcs_hook._parse_gcs_url(url), expected)
        def _assert_upload(bucket, object, filename):
            """
            Upload stand-in: check the bucket parsed from ``GCS_PATH`` is
            'test' and that the object name, minus the leading path prefix,
            is one of ``MOCK_FILES``.
            """
            expected_bucket, path_prefix = _parse_gcs_url(GCS_PATH)
            self.assertEqual(expected_bucket, 'test')

            # Drop as many leading characters as the parsed prefix is long.
            relative_name = object[len(path_prefix):]
            self.assertIn(relative_name, MOCK_FILES)
    def _gcs_object_is_directory(object):
        """
        Return True when the blob part of the given GCS url is empty or
        ends with a slash.
        """
        _bucket, blob_name = _parse_gcs_url(object)

        if len(blob_name) == 0:
            return True
        return blob_name.endswith('/')
    def execute(self, context):
        """
        Sync the files listed by the parent S3 operator to Google Cloud
        Storage.

        When ``self.replace`` is false, files whose name (relative to the
        destination prefix) is already listed in the bucket are skipped.

        :param context: task context, passed through to the parent ``execute``
        :return: the S3 keys that were uploaded; a ``list`` when
            ``self.replace`` is false, otherwise the value returned by the
            parent ``execute``
        """
        # use the super method to list all the files in an S3 bucket/key
        files = super(S3ToGoogleCloudStorageOperator, self).execute(context)

        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.dest_gcs_conn_id,
            delegate_to=self.delegate_to)

        if not self.replace:
            # if we are not replacing -> list all files in the GCS bucket
            # and only keep those files which are present in
            # S3 and not in Google Cloud Storage
            bucket_name, object_prefix = _parse_gcs_url(self.dest_gcs)
            existing_files_prefixed = gcs_hook.list(bucket_name,
                                                    prefix=object_prefix)

            # Skip the prefix itself (an empty "directory" object) and strip
            # the prefix from every other name so it can be compared with
            # the S3 keys. The list returned by the hook is left untouched.
            existing_files = [
                name[len(object_prefix):]
                if name.startswith(object_prefix) else name
                for name in (existing_files_prefixed or [])
                if name != object_prefix
            ]

            files = list(set(files) - set(existing_files))
            if files:
                self.log.info('%s files are going to be synced: %s.',
                              len(files), files)
            else:
                self.log.info(
                    'There are no new files to sync. Have a nice day!')

        if files:
            hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)

            # The destination is the same for every file: parse it once.
            dest_gcs_bucket, dest_gcs_object_prefix = _parse_gcs_url(
                self.dest_gcs)

            for file in files:
                # GCS hook builds its own in-memory file so we have to create
                # and pass the path
                file_object = hook.get_key(file, self.bucket)
                with NamedTemporaryFile(mode='wb', delete=True) as f:
                    file_object.download_fileobj(f)
                    f.flush()

                    # There will always be a '/' before file because it is
                    # enforced at instantiation time
                    dest_gcs_object = dest_gcs_object_prefix + file

                    # No per-file log line here: the sync is sequential and
                    # the hook already logs a lot.
                    gcs_hook.upload(dest_gcs_bucket, dest_gcs_object, f.name)

            self.log.info(
                "All done, uploaded %d files to Google Cloud Storage",
                len(files))
        else:
            # The space after "Cloud" was missing in the original message.
            self.log.info(
                'In sync, no files needed to be uploaded to Google Cloud '
                'Storage')

        return files
    def _gcs_object_is_directory(self, object):
        """
        Tell whether a GCS url designates a directory-like location.

        :param object: url handed to ``_parse_gcs_url``
        :return: ``True`` when the parsed blob is empty (the url points at
            the bucket root) or ends with ``'/'``
        """
        _, blob = _parse_gcs_url(object)

        # An empty blob means the bucket root, which is a valid place to put
        # objects just like any '/'-terminated prefix.
        return not blob or blob.endswith('/')
        def _assert_upload(bucket, object, tmp_filename):
            """
            Upload stand-in: the bucket must match the one parsed from
            ``GCS_PATH_PREFIX`` and the object name, minus the leading
            prefix, must be one of ``MOCK_FILES``.
            """
            expected_bucket, prefix = _parse_gcs_url(GCS_PATH_PREFIX)
            self.assertEqual(expected_bucket, bucket)

            prefix_length = len(prefix)
            self.assertIn(object[prefix_length:], MOCK_FILES)
        def _assert_upload(bucket, object, tmp_filename):
            """
            Verify an upload call: same bucket as ``GCS_PATH_PREFIX`` and an
            object whose name, with the leading prefix removed, appears in
            ``MOCK_FILES``.
            """
            parsed_destination = _parse_gcs_url(GCS_PATH_PREFIX)
            destination_bucket, destination_prefix = parsed_destination

            self.assertEqual(destination_bucket, bucket)

            uploaded_name = object[len(destination_prefix):]
            self.assertIn(uploaded_name, MOCK_FILES)
    def _gcs_object_is_directory(object):
        """
        Report whether the given GCS url has an empty blob or a blob that
        ends with '/'.
        """
        parsed = _parse_gcs_url(object)
        _unused_bucket, blob_path = parsed

        is_empty = len(blob_path) == 0
        return is_empty or blob_path.endswith('/')
    def execute(self, context):
        """
        Sync the files listed by the parent S3 operator to Google Cloud
        Storage.

        When ``self.replace`` is false, files whose name (relative to the
        destination prefix) is already listed in the bucket are skipped.

        :param context: task context, passed through to the parent ``execute``
        :return: the S3 keys that were uploaded; a ``list`` when
            ``self.replace`` is false, otherwise the value returned by the
            parent ``execute``
        """
        # use the super method to list all the files in an S3 bucket/key
        files = super().execute(context)

        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.dest_gcs_conn_id,
            delegate_to=self.delegate_to)

        if not self.replace:
            # if we are not replacing -> list all files in the GCS bucket
            # and only keep those files which are present in
            # S3 and not in Google Cloud Storage
            bucket_name, object_prefix = _parse_gcs_url(self.dest_gcs)
            existing_files_prefixed = gcs_hook.list(
                bucket_name, prefix=object_prefix)

            # Skip the prefix itself (an empty "directory" object) and strip
            # the prefix from every other name so it can be compared with
            # the S3 keys. The list returned by the hook is left untouched.
            prefix_length = len(object_prefix)
            existing_files = {
                name[prefix_length:]
                if name.startswith(object_prefix) else name
                for name in (existing_files_prefixed or [])
                if name != object_prefix
            }

            files = list(set(files) - existing_files)
            if files:
                self.log.info(
                    '%s files are going to be synced: %s.', len(files), files
                )
            else:
                self.log.info(
                    'There are no new files to sync. Have a nice day!')

        if files:
            hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify)

            # The destination is the same for every file: parse it once.
            dest_gcs_bucket, dest_gcs_object_prefix = _parse_gcs_url(
                self.dest_gcs)

            for file in files:
                # GCS hook builds its own in-memory file so we have to create
                # and pass the path
                file_object = hook.get_key(file, self.bucket)
                with NamedTemporaryFile(mode='wb', delete=True) as f:
                    file_object.download_fileobj(f)
                    f.flush()

                    # There will always be a '/' before file because it is
                    # enforced at instantiation time
                    dest_gcs_object = dest_gcs_object_prefix + file

                    # No per-file log line here: the sync is sequential and
                    # the hook already logs a lot.
                    gcs_hook.upload(dest_gcs_bucket, dest_gcs_object, f.name)

            self.log.info(
                "All done, uploaded %d files to Google Cloud Storage",
                len(files))
        else:
            # The space after "Cloud" was missing in the original message.
            self.log.info(
                'In sync, no files needed to be uploaded to Google Cloud '
                'Storage')

        return files
Beispiel #14
0
        def _assert_upload(bucket_name, object_name, filename):  # pylint: disable=unused-argument
            """
            Upload stand-in: check the bucket parsed from ``GCS_PATH`` is
            'test' and that ``object_name``, minus the leading path prefix,
            is one of ``MOCK_FILES``.
            """
            target_bucket, target_prefix = _parse_gcs_url(GCS_PATH)
            self.assertEqual(target_bucket, 'test')

            name_without_prefix = object_name[len(target_prefix):]
            self.assertIn(name_without_prefix, MOCK_FILES)
        def _assert_upload(bucket, object, filename):
            """
            Verify an upload call against ``GCS_PATH``: its parsed bucket
            must be 'test' and the uploaded object name, with the leading
            prefix cut off, must appear in ``MOCK_FILES``.
            """
            parsed_path = _parse_gcs_url(GCS_PATH)
            path_bucket, path_prefix = parsed_path

            self.assertEqual(path_bucket, 'test')

            offset = len(path_prefix)
            self.assertIn(object[offset:], MOCK_FILES)