Example No. 1
    CloudDataCatalogGetTagTemplateOperator,
    CloudDataCatalogListTagsOperator,
    CloudDataCatalogLookupEntryOperator,
    CloudDataCatalogRenameTagTemplateFieldOperator,
    CloudDataCatalogSearchCatalogOperator,
    CloudDataCatalogUpdateEntryOperator,
    CloudDataCatalogUpdateTagOperator,
    CloudDataCatalogUpdateTagTemplateFieldOperator,
    CloudDataCatalogUpdateTagTemplateOperator,
)

TEST_PROJECT_ID: str = "example_id"
TEST_LOCATION: str = "en-west-3"
TEST_ENTRY_ID: str = "test-entry-id"
TEST_TAG_ID: str = "test-tag-id"
TEST_RETRY: Retry = Retry()
TEST_TIMEOUT: float = 0.5
TEST_METADATA: Sequence[Tuple[str, str]] = []
TEST_GCP_CONN_ID: str = "test-gcp-conn-id"
TEST_IMPERSONATION_CHAIN: Sequence[str] = ["ACCOUNT_1", "ACCOUNT_2", "ACCOUNT_3"]
TEST_ENTRY_GROUP_ID: str = "test-entry-group-id"
TEST_TAG_TEMPLATE_ID: str = "test-tag-template-id"
TEST_TAG_TEMPLATE_FIELD_ID: str = "test-tag-template-field-id"
TEST_TAG_TEMPLATE_NAME: str = "test-tag-template-field-name"
TEST_FORCE: bool = False
TEST_READ_MASK: Dict = {"fields": ["name"]}
TEST_RESOURCE: str = "test-resource"
TEST_OPTIONS_: Dict = {}
TEST_PAGE_SIZE: int = 50
TEST_LINKED_RESOURCE: str = "test-linked-resource"
TEST_SQL_RESOURCE: str = "test-sql-resource"
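These constants mirror the keyword arguments that the Data Catalog operator tests pass through to the operators. As a hedged illustration only (the parameter names below are an assumption about the Airflow Google provider's signature, not something shown in this snippet), one of the imported operators could be wired up like this:

# Hypothetical wiring of the test constants into one of the imported operators;
# every parameter name here is assumed, not taken from the snippet above.
get_tag_template = CloudDataCatalogGetTagTemplateOperator(
    task_id="get_tag_template",
    location=TEST_LOCATION,
    tag_template=TEST_TAG_TEMPLATE_ID,
    project_id=TEST_PROJECT_ID,
    retry=TEST_RETRY,
    timeout=TEST_TIMEOUT,
    metadata=TEST_METADATA,
    gcp_conn_id=TEST_GCP_CONN_ID,
    impersonation_chain=TEST_IMPERSONATION_CHAIN,
)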
Example No. 2
import os
import time
from concurrent.futures import Future, ThreadPoolExecutor, as_completed
from queue import Empty
from typing import List

from google.api_core.retry import Retry


def upload_tiff_and_json_files(logger, filepaths_to_upload, bucket, stats, uncompressed_blob_prefix, extraction_path):
    google_retry = Retry(deadline=480, maximum=240)

    def on_google_retry_error(ex: Exception):
        logger.error("Exception when uploading blob to google cloud.")
        logger.exception(ex)

    def google_cloud_uploader():
        start = time.time()

        # Block up to 30 s for the first work item; a queue.Empty timeout ends this worker.
        blob_name, filepath, content_type = filepaths_to_upload.get(timeout=30)
        while True:
            blob = bucket.blob(blob_name)
            try:
                # Wrap the bound method with the retry policy first, then call the wrapped
                # function; wrapping the *result* of the call would skip the retries entirely.
                google_retry(blob.upload_from_filename, on_error=on_google_retry_error)(
                    filepath, content_type=content_type)
            except Exception as ex:
                logger.error(f"Uncaught exception when uploading blob to google cloud.")
                logger.exception(ex)
                filepaths_to_upload.put((blob.name, filepath, content_type))
                raise ex
            stats['num_files_uploaded'] += 1

            if stats['num_files_uploaded'] > stats['checkpoint']:
                elapsed = (time.time() - start) / 60
                logger.info(
                    f"Uploaded {stats['num_files_uploaded']} files in {elapsed} minutes, {stats['num_files_uploaded'] / elapsed} files per minute.")
                stats['checkpoint'] += 1000
            blob_name, filepath, content_type = filepaths_to_upload.get(timeout=5)

    def traverse_directory():
        for subdir_name in os.listdir(extraction_path):
            subdir_path = extraction_path + "/" + subdir_name
            if os.path.isdir(subdir_path):
                for filename in os.listdir(subdir_path):
                    if not filename.startswith("._"):
                        split = filename.rsplit(".")
                        if split[-1] == "tif" and split[-2].endswith(("B02", "B03", "B04")):
                            content_type = "image/tiff"
                            # multiple tiff files per subdirectory
                            blob_name: str = os.path.join(uncompressed_blob_prefix, "tiff", subdir_name, filename)

                            filepath = subdir_path + "/" + filename
                            filepaths_to_upload.put((blob_name, filepath, content_type))
                        elif split[-1] == "json":
                            # one json file per subdirectory
                            blob_name: str = os.path.join(uncompressed_blob_prefix, "json_metadata", filename)

                            filepath = subdir_path + "/" + filename
                            filepaths_to_upload.put((blob_name, filepath, content_type))

    num_workers = int(os.environ.get("NUM_WORKERS", 3))
    with ThreadPoolExecutor(max_workers=num_workers + 1) as executor:
        tasks: List[Future] = []
        for x in range(num_workers):
            tasks.append(executor.submit(google_cloud_uploader))
        tasks.append(executor.submit(traverse_directory))
        logger.info(f"Started {len(tasks)} worker tasks.")

        logger.info("Starting traverse_directory")
        for task in as_completed(tasks):
            if task.exception() is not None:
                if isinstance(task.exception(), Empty):
                    logger.info("Child thread completed")
                else:
                    logger.error("Child thread failed")
                    logger.exception(task.exception())

    logger.info("Ending job")
Example No. 3
import logging
import os
import queue
import sys

import gcsfs
import imageio
from google.api_core.retry import Retry
from google.cloud import storage

gcs_client = storage.Client()
bucket_name: str = os.environ.get("GCS_BUCKET_NAME")
disk_path: str = os.environ.get("DISK_PATH")
logger = logging.Logger(name='logger', level=logging.INFO)
handler = logging.StreamHandler(sys.stdout)
logger.addHandler(handler)


def on_google_retry_error(ex: Exception):
    logger.error("Exception when uploading blob to google cloud.")
    logger.exception(ex)


fs = gcsfs.GCSFileSystem(project='big_earth')

google_retry = Retry(deadline=480, maximum=240)

image_paths = queue.Queue()

stats = {
    "pixel_sum": 0,
    "num_images": 0,
}


def get_image_sum_from_gcs():
    image_path = image_paths.get(timeout=30)
    # Wrap the callable with the retry policy, then call it with the path argument.
    r = google_retry(fs.cat, on_error=on_google_retry_error)(image_path)
    img = imageio.core.asarray(imageio.imread(r, 'TIFF'))
    stats['pixel_sum'] += img.sum()
    stats['num_images'] += 1
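get_image_sum_from_gcs drains one path per call, so a driver loop is needed around it. A hedged sketch of such a driver (the object paths are invented for illustration):

# Hypothetical driver: the object paths below are made up.
for path in ["big_earth/tiff/patch_1/patch_1_B02.tif",
             "big_earth/tiff/patch_2/patch_2_B02.tif"]:
    image_paths.put(path)

while not image_paths.empty():
    get_image_sum_from_gcs()

if stats["num_images"]:
    logger.info(f"mean pixel value: {stats['pixel_sum'] / stats['num_images']}")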
Example No. 4
def test_list_documents_w_retry_timeout():
    from google.api_core.retry import Retry

    retry = Retry(predicate=object())
    timeout = 123.0
    _list_documents_helper(retry=retry, timeout=timeout)
Example No. 5
    async def test_get_all_w_retry_timeout(self):
        from google.api_core.retry import Retry

        retry = Retry(predicate=object())
        timeout = 123.0
        await self._get_all_helper(retry=retry, timeout=timeout)
Example No. 6
    def test_collections_w_retry_timeout(self):
        from google.api_core.retry import Retry

        retry = Retry(predicate=object())
        timeout = 123.0
        self._collections_helper(retry=retry, timeout=timeout)
Example No. 7
    def test_execute_update_w_timeout_and_retry_params(self):
        self._execute_update_helper(retry=Retry(deadline=60), timeout=2.0)
Example No. 8
def _transient_string_in_exception_message(exc):
    # type: (Exception) -> bool
    """Determines whether an exception's message contains a common message for transient errors.

    The exception's message containing one of these substrings is sufficient to determine that it is
    transient, but there can be transient exceptions whose messages do not contain these substrings.
    """
    return ('The job encountered an internal error during execution'
            in str(exc)
            or 'Retrying the job may solve the problem' in str(exc))


# Retry object for errors encountered in making API calls (executing jobs, etc.)
DEFAULT_RETRY_FOR_API_CALLS = Retry(
    # The predicate takes an exception and returns whether it is transient.
    predicate=lambda exc: (bq_retry.DEFAULT_RETRY._predicate(exc) or
                           _transient_string_in_exception_message(exc)),
    deadline=DEFAULT_TIMEOUT_SEC)

# Retry object for errors encountered while polling jobs in progress.
# See https://github.com/googleapis/google-cloud-python/issues/6301
DEFAULT_RETRY_FOR_ASYNC_JOBS = Retry(
    # The predicate takes an exception and returns whether it is transient.
    predicate=lambda exc: (polling.DEFAULT_RETRY._predicate(exc) or
                           _transient_string_in_exception_message(exc)),
    deadline=DEFAULT_TIMEOUT_SEC)


class BigqueryBaseClient(object):
    """Stores credentials and pointers to a BigQuery project."""
Example No. 9
    def test_get_w_document_ref_w_retry_timeout(self):
        from google.api_core.retry import Retry

        retry = Retry(predicate=object())
        timeout = 123.0
        self._get_w_document_ref_helper(retry=retry, timeout=timeout)
Example No. 10
    def test_execute_update_w_retry_param(self):
        self._execute_update_helper(retry=Retry(deadline=60))
Example No. 11
from google.cloud.bigtable.row import ConditionalRow
from google.cloud.bigtable.row import DirectRow
from google.cloud.bigtable.row_data import PartialRowsData
from grpc import StatusCode

# Maximum number of mutations in bulk (MutateRowsRequest message):
# (https://cloud.google.com/bigtable/docs/reference/data/rpc/
#  google.bigtable.v2#google.bigtable.v2.MutateRowRequest)
_MAX_BULK_MUTATIONS = 100000

DEFAULT_RETRY = Retry(
    predicate=if_exception_type((
        Aborted,
        DeadlineExceeded,
        ServiceUnavailable,
    ), ),
    initial=1.0,
    maximum=15.0,
    multiplier=2.0,
    deadline=120.0,  # 2 minutes
)
"""The default retry stategy to be used on retry-able errors.

Used by :meth:`~google.cloud.bigtable.table.Table.mutate_rows`.
"""


class TableMismatchError(ValueError):
    """Row from another table."""

Example No. 12
async def test_asynctransaction_get_w_document_ref_w_retry_timeout():
    from google.api_core.retry import Retry

    retry = Retry(predicate=object())
    timeout = 123.0
    await _get_w_document_ref_helper(retry=retry, timeout=timeout)
Example No. 13
from mediawords.util.log import create_logger
from mediawords.workflow.exceptions import McProgrammingError

from .config import GCAuthConfig
from .transcript import Transcript, UtteranceAlternative, Utterance
from .media_info import MediaFileInfoAudioStream

log = create_logger(__name__)

# Speech API sometimes throws:
#
#   google.api_core.exceptions.ServiceUnavailable: 503 failed to connect to all addresses
#
# so let it retry for 10 minutes or so.
_GOOGLE_API_RETRIES = Retry(initial=5, maximum=60, multiplier=2, deadline=60 * 10)
"""Google Cloud API's own retry policy."""


def submit_transcribe_operation(gs_uri: str,
                                episode_metadata: MediaFileInfoAudioStream,
                                bcp47_language_code: str,
                                gc_auth_config: Optional[GCAuthConfig] = None) -> str:
    """
    Submit a Speech API long running operation to transcribe a podcast episode.

    :param gs_uri: Google Cloud Storage URI to a transcoded episode.
    :param episode_metadata: Metadata derived from the episode while transcoding it.
    :param bcp47_language_code: Episode's BCP 47 language code guessed from story's title + description.
    :param gc_auth_config: Google Cloud authentication configuration instance.
    :return: Google Speech API operation ID by which the transcription operation can be referred to.
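The body of submit_transcribe_operation is cut off above. Independently of that implementation, a hedged sketch of handing _GOOGLE_API_RETRIES to a Speech long-running call could look roughly like this (assuming google-cloud-speech 2.x; the encoding, sample rate, and URI are placeholders, not the project's real settings):

from google.cloud import speech

# Illustrative values only; nothing here is taken from the snippet above.
client = speech.SpeechClient()
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
    sample_rate_hertz=44100,
    language_code="en-US",
)
audio = speech.RecognitionAudio(uri="gs://example-bucket/episode.flac")
operation = client.long_running_recognize(
    config=config, audio=audio, retry=_GOOGLE_API_RETRIES)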
Example No. 14
# [START howto_operator_vision_detect_image_param]
DETECT_IMAGE = {"source": {"image_uri": GCP_VISION_ANNOTATE_IMAGE_URL}}
# [END howto_operator_vision_detect_image_param]

with models.DAG('example_gcp_vision_autogenerated_id',
                start_date=days_ago(1),
                schedule_interval=None) as dag_autogenerated_id:
    # ################################## #
    # ### Autogenerated IDs examples ### #
    # ################################## #

    # [START howto_operator_vision_product_set_create]
    product_set_create = CloudVisionCreateProductSetOperator(
        location=GCP_VISION_LOCATION,
        product_set=product_set,
        retry=Retry(maximum=10.0),
        timeout=5,
        task_id='product_set_create',
    )
    # [END howto_operator_vision_product_set_create]

    # [START howto_operator_vision_product_set_get]
    product_set_get = CloudVisionGetProductSetOperator(
        location=GCP_VISION_LOCATION,
        product_set_id="{{ task_instance.xcom_pull('product_set_create') }}",
        task_id='product_set_get',
    )
    # [END howto_operator_vision_product_set_get]

    # [START howto_operator_vision_product_set_update]
    product_set_update = CloudVisionUpdateProductSetOperator(
Example No. 15
    def delete_table(self,
                     mode="staging",
                     bucket_name=None,
                     not_found_ok=False):
        """Deletes a table from storage, sends request in batches.

        Args:
            mode (str): Folder of which dataset to update [raw|staging|header|auxiliary_files|architecture].
                Defaults to "staging".

            bucket_name (str):
                The bucket name from which to delete the table. If None, defaults to the bucket initialized when instantiating the Storage object.
                (You can check it with the Storage().bucket property)

            not_found_ok (bool): Optional.
                What to do if table not found

        """

        prefix = f"{mode}/{self.dataset_id}/{self.table_id}/"

        if bucket_name is not None:
            table_blobs = list(self.client["storage_staging"].bucket(
                bucket_name).list_blobs(prefix=prefix))
        else:
            table_blobs = list(self.bucket.list_blobs(prefix=prefix))

        if table_blobs == []:
            if not_found_ok:
                return
            else:
                raise FileNotFoundError(
                    f"Could not find the requested table {self.dataset_id}.{self.table_id}"
                )

        else:
            # Divides table_blobs list for maximum batch request size
            table_blobs_chunks = [
                table_blobs[i:i + 999] for i in range(0, len(table_blobs), 999)
            ]

            for i, source_table in enumerate(
                    tqdm(table_blobs_chunks, desc="Delete Table Chunk")):
                counter = 0
                while counter < 100:
                    try:
                        with self.client["storage_staging"].batch():
                            for blob in source_table:
                                blob.delete(retry=Retry(
                                    predicate=_is_retryable))
                        break
                    except Exception as e:
                        print(
                            f"Delete Table Chunk {i} | Attempt {counter}: delete operation starts again in 5 seconds...",
                        )
                        time.sleep(5)
                        counter += 1
                        traceback.print_exc(file=sys.stderr)
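The docstring lists delete_table's arguments; a hedged usage sketch follows, assuming the surrounding class is basedosdados' Storage and that it is constructed with dataset_id and table_id (that constructor is an assumption, not shown in the snippet):

# Assumed constructor; only mode/not_found_ok come from the docstring above.
from basedosdados import Storage

st = Storage(dataset_id="example_dataset", table_id="example_table")
st.delete_table(mode="staging", not_found_ok=True)  # returns quietly if no blobs match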
Example No. 16
# pylint: enable=ungrouped-imports


SPANNER_DATA_SCOPE = "https://www.googleapis.com/auth/spanner.data"


_DATABASE_NAME_RE = re.compile(
    r"^projects/(?P<project>[^/]+)/"
    r"instances/(?P<instance_id>[a-z][-a-z0-9]*)/"
    r"databases/(?P<database_id>[a-z][a-z0-9_\-]*[a-z0-9])$"
)

_DATABASE_METADATA_FILTER = "name:{0}/operations/"

DEFAULT_RETRY_BACKOFF = Retry(initial=0.02, maximum=32, multiplier=1.3)


class Database(object):
    """Representation of a Cloud Spanner Database.

    We can use a :class:`Database` to:

    * :meth:`create` the database
    * :meth:`reload` the database
    * :meth:`update` the database
    * :meth:`drop` the database

    :type database_id: str
    :param database_id: The ID of the database.
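Earlier in this example, _DATABASE_NAME_RE captures the parts of a fully qualified database name. A quick standalone check of what it accepts (the resource name below is invented):

# Standalone check of the pattern defined above; the resource name is made up.
name = "projects/my-project/instances/my-instance/databases/my_database"
match = _DATABASE_NAME_RE.match(name)
if match:
    print(match.group("project"), match.group("instance_id"), match.group("database_id"))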
Example No. 17
    def copy_table(
        self,
        source_bucket_name="basedosdados",
        destination_bucket_name=None,
        mode="staging",
    ):
        """Copies table from a source bucket to your bucket, sends request in batches.

        Args:
            source_bucket_name (str):
                The bucket name from which to copy data. You can change it
                to copy from other external bucket.

            destination_bucket_name (str): Optional
                The bucket name where data will be copied to.
                If None, defaults to the bucket initialized when instantiating the Storage object (You can check it with the
                Storage().bucket property)

            mode (str): Folder of which dataset to update [raw|staging|header|auxiliary_files|architecture].
                Defaults to "staging".
        """

        source_table_ref = list(self.client["storage_staging"].bucket(
            source_bucket_name).list_blobs(
                prefix=f"{mode}/{self.dataset_id}/{self.table_id}/"))

        if source_table_ref == []:
            raise FileNotFoundError(
                f"Could not find the requested table {self.dataset_id}.{self.table_id}"
            )

        if destination_bucket_name is None:
            destination_bucket = self.bucket
        else:
            destination_bucket = self.client["storage_staging"].bucket(
                destination_bucket_name)

        # Divides source_table_ref list for maximum batch request size
        source_table_ref_chunks = [
            source_table_ref[i:i + 999]
            for i in range(0, len(source_table_ref), 999)
        ]

        for i, source_table in enumerate(
                tqdm(source_table_ref_chunks, desc="Copy Table Chunk")):
            counter = 0
            while counter < 100:
                try:
                    with self.client["storage_staging"].batch():
                        for blob in source_table:
                            self.bucket.copy_blob(
                                blob,
                                destination_bucket=destination_bucket,
                                retry=Retry(predicate=_is_retryable),
                            )
                    break
                except Exception as e:
                    print(
                        f"Copy Table Chunk {i} | Attempt {counter}: copy operation starts again in 5 seconds...",
                    )
                    counter += 1
                    time.sleep(5)
                    traceback.print_exc(file=sys.stderr)
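As with delete_table, a hedged copy_table sketch under the same assumption about the Storage constructor:

# Assumed constructor; source_bucket_name and mode are documented in the docstring above.
from basedosdados import Storage

st = Storage(dataset_id="example_dataset", table_id="example_table")
st.copy_table(source_bucket_name="basedosdados", mode="staging")  # copies into st's own bucket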
Example No. 18
async def test_asynccollectionreference_list_documents_w_retry_timeout():
    from google.api_core.retry import Retry

    retry = Retry(predicate=object())
    timeout = 123.0
    await _list_documents_helper(retry=retry, timeout=timeout)
Example No. 19
# [START howto_operator_vision_detect_image_param]
DETECT_IMAGE = {"source": {"image_uri": GCP_VISION_ANNOTATE_IMAGE_URL}}
# [END howto_operator_vision_detect_image_param]

with models.DAG(
    'example_gcp_vision_autogenerated_id', default_args=default_args, schedule_interval=None
) as dag_autogenerated_id:
    # ################################## #
    # ### Autogenerated IDs examples ### #
    # ################################## #

    # [START howto_operator_vision_product_set_create]
    product_set_create = CloudVisionCreateProductSetOperator(
        location=GCP_VISION_LOCATION,
        product_set=product_set,
        retry=Retry(maximum=10.0),
        timeout=5,
        task_id='product_set_create',
    )
    # [END howto_operator_vision_product_set_create]

    # [START howto_operator_vision_product_set_get]
    product_set_get = CloudVisionGetProductSetOperator(
        location=GCP_VISION_LOCATION,
        product_set_id="{{ task_instance.xcom_pull('product_set_create') }}",
        task_id='product_set_get',
    )
    # [END howto_operator_vision_product_set_get]

    # [START howto_operator_vision_product_set_update]
    product_set_update = CloudVisionUpdateProductSetOperator(
Example No. 20
from airflow import version
from airflow.exceptions import AirflowException
from airflow.providers.google.cloud.hooks.cloud_memorystore import CloudMemorystoreHook
from tests.providers.google.cloud.utils.base_gcp_mock import (
    GCP_PROJECT_ID_HOOK_UNIT_TEST,
    mock_base_gcp_hook_default_project_id,
    mock_base_gcp_hook_no_default_project_id,
)

TEST_GCP_CONN_ID = "test-gcp-conn-id"  # type: str
TEST_DELEGATE_TO = "test-delegate-to"  # type: str
TEST_LOCATION = "test-location"  # type: str
TEST_INSTANCE_ID = "test-instance-id"  # type: str
TEST_PROJECT_ID = "test-project-id"  # type:  str
TEST_RETRY = Retry()  # type: Retry
TEST_TIMEOUT = 10  # type: float
TEST_METADATA = [("KEY", "VALUE")]  # type:  Sequence[Tuple[str, str]]
TEST_PAGE_SIZE = 100  # type: int
TEST_UPDATE_MASK = {"paths": ["memory_size_gb"]}  # type: Dict
TEST_PARENT = "projects/test-project-id/locations/test-location"  # type: str
TEST_NAME = "projects/test-project-id/locations/test-location/instances/test-instance-id"  # type: str
TEST_PARENT_DEFAULT_PROJECT_ID = "projects/{}/locations/test-location".format(
    GCP_PROJECT_ID_HOOK_UNIT_TEST)  # type: str
TEST_NAME_DEFAULT_PROJECT_ID = "projects/{}/locations/test-location/instances/test-instance-id".format(
    GCP_PROJECT_ID_HOOK_UNIT_TEST)  # type: str


class TestCloudMemorystoreWithDefaultProjectIdHook(TestCase):
    def setUp(self):
        with mock.patch(
Example No. 21
def test_documentreference_delete_w_retry_timeout():
    from google.api_core.retry import Retry

    retry = Retry(predicate=object())
    timeout = 123.0
    _delete_helper(retry=retry, timeout=timeout)
Example No. 22
# Maximum number of mutations in bulk (MutateRowsRequest message):
# (https://cloud.google.com/bigtable/docs/reference/data/rpc/
#  google.bigtable.v2#google.bigtable.v2.MutateRowRequest)
_MAX_BULK_MUTATIONS = 100000
VIEW_NAME_ONLY = enums.Table.View.NAME_ONLY


class _BigtableRetryableError(Exception):
    """Retry-able error expected by the default retry strategy."""


DEFAULT_RETRY = Retry(
    predicate=if_exception_type(_BigtableRetryableError),
    initial=1.0,
    maximum=15.0,
    multiplier=2.0,
    deadline=120.0,  # 2 minutes
)
"""The default retry stategy to be used on retry-able errors.

Used by :meth:`~google.cloud.bigtable.table.Table.mutate_rows`.
"""


class TableMismatchError(ValueError):
    """Row from another table."""


class TooManyMutationsError(ValueError):
    """The number of mutations for bulk request is too big."""
Example No. 23
def test_client_get_all_w_retry_timeout():
    from google.api_core.retry import Retry

    retry = Retry(predicate=object())
    timeout = 123.0
    _get_all_helper(retry=retry, timeout=timeout)