Example #1
def get_blobs_in_gcs_loc(
        client: storage.Client,
        bucket_name: str,
        bucket_path: str
):
    """
    Downloads all blobs under the given prefix and merges their JSON contents.
    :param client: GCS client
    :param bucket_name: GCS bucket name
    :param bucket_path: prefix of the objects in the bucket
    :return: combined list of records parsed from the blobs
    """
    bucket = client.bucket(bucket_name)
    # Get name of results files in bucket location
    blob_names = [
        blob.name
        for blob in bucket.list_blobs(prefix=bucket_path)
        if blob.name != bucket_path
    ]

    results = []

    for blob_name in blob_names:
        blob_contents = (
            bucket.get_blob(blob_name).download_as_string().decode('utf-8')
        )

        results += json.loads(blob_contents)

    return results
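A minimal usage sketch for the helper above, assuming application-default credentials and that each object under the prefix contains a JSON list; the bucket name and prefix are placeholders, not values from the original project.

from google.cloud import storage

# Placeholder bucket and prefix; the function returns the merged JSON records.
client = storage.Client()
records = get_blobs_in_gcs_loc(client, "example-results-bucket", "results/2021-01-01/")
print(f"Loaded {len(records)} records")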
Example #2
    def _write_async(self):
        if len(self.pending_records) == 0:
            return
        try:
            client = Client(project=DB_LOGGER_WRITE_TO_GCS_PROJECT_ID)
            bucket_path = f"{self.bucket_inner_path}/{self.filename}"
            if DB_LOGGER_WRITE_TO_GCS_MULTI_FILE_LOG:
                bucket_path = self._compose_progressing_log_file_name(
                    bucket_path)

            bucket = client.bucket(bucket_name=self.bucket_name)
            blob = bucket.blob(blob_name=bucket_path)

            records = self.pending_records
            self.pending_records = []

            if not DB_LOGGER_WRITE_TO_GCS_MULTI_FILE_LOG and blob.exists():
                current_log = blob.download_as_string().decode(
                    encoding="utf-8").strip()
                if current_log:
                    records.insert(0, current_log)
                # Reset the blob
                blob = bucket.blob(blob_name=bucket_path)

            blob.upload_from_string("\n".join(records))

        except Exception as err:
            airflow_db_logger_log.error(
                f"Failed to flash to bucket @ {self.bucket_name}/{self.bucket_inner_path}/{self.filename}"
            )
            airflow_db_logger_log.error(err)
Example #3
def hello(**kwargs):
    gcs = Client()
    bucket = gcs.bucket("data.visitdata.org")
    blob = bucket.blob("processed/hello/lastrun")
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    blob.upload_from_string(f"{timestamp}\n")
    print("Successfully wrote timestamp to bucket: {}".format(timestamp))
Example #4
def __download_bucket(storage_client: storage.Client, dst: str, url: str):
    filepath_parts = url.split("/")
    filename = filepath_parts[-1]
    bucket_name = "/".join(filepath_parts[2:-1])
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(filename)
    blob.download_to_filename(os.path.join(dst, filename))
Example #5
    def _create_connection(self):
        client = Client(
            credentials=self.credentials,
            project=self.project,
        )

        return client.bucket(self.bucket)
Example #6
    def __init__(self, bucket_name, project=None, credentials=None):
        """
            Constructor

                :param bucket_name:
                    Name of the bucket that the files are on.

                :param project: the project which the client acts on behalf of. Will be
                                passed when creating a topic.  If not passed,
                                falls back to the default inferred from the environment.

                :param credentials: (Optional) The OAuth2 Credentials to use for this
                                    client. If not passed (and if no ``_http`` object is
                                    passed), falls back to the default inferred from the
                                    environment.

            Make sure the credentials have the correct permissions set up on
            Google Cloud or else GoogleStorage will return a 403 FORBIDDEN error.
        """
        if not Client:
            raise ValueError(
                'Could not import google.cloud.storage. You can install '
                'google.cloud.storage by using pip install google-cloud-storage'
            )

        connection = Client(project=project, credentials=credentials)
        self.bucket = connection.bucket(bucket_name)
Example #7
    def _remove_from_cloudstorage(self, blob_path: str):
        client = Client()
        bucket = client.bucket(self.BUCKET)
        try:  # don't fail entire task if this fails
            bucket.delete_blob(blob_path)
        except NotFound:
            print(f"{blob_path} not found")
Example #8
def _convert_ndjson_to_json(bucket_name: str, target_path: str, table: str,
                            storage_client: storage.Client, tmp: str):
    """Converts the provided ndjson file on GCS to json."""
    ndjson_blob_path = f"gs://{bucket_name}/{target_path}/{table}-{tmp}.ndjson"
    json_blob_path = f"gs://{bucket_name}/{target_path}/{table}-{tmp}.json"

    logger.info(f"Convert {ndjson_blob_path} to {json_blob_path}")

    # stream from GCS
    with smart_open.open(ndjson_blob_path) as fin:
        first_line = True

        with smart_open.open(json_blob_path, "w") as fout:
            fout.write("[")

            for line in fin:
                if not first_line:
                    fout.write(",")

                fout.write(line.replace("\n", ""))
                first_line = False

            fout.write("]")
            fout.close()
            fin.close()

    # delete ndjson file from bucket
    logger.info(f"Remove file {table}-{tmp}.ndjson")
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(f"{target_path}/{table}-{tmp}.ndjson")
    blob.delete()
    logger.info(f"Rename file {table}-{tmp}.json to {table}.json")
    bucket.rename_blob(bucket.blob(f"{target_path}/{table}-{tmp}.json"),
                       f"{target_path}/{table}.json")
Example #9
def main():
    args = parse_args()

    # Imports of thor modules are deferred until after argument parsing to avoid
    # numba JIT time if the arguments are invalid or the user asked for --help.
    import thor.utils.logging

    thor.utils.logging.setupLogger("thor")

    from thor.taskqueue.client import Client as TaskQueueClient
    from thor.taskqueue.queue import TaskQueueConnection
    from thor.orbits import Orbits
    from thor.config import Config

    if not isinstance(args.config, str):
        config = Config
    else:
        config = Config.fromYaml(args.config)

    # Read observations
    preprocessed_observations = pd.read_csv(args.preprocessed_observations,
                                            index_col=False,
                                            dtype={"obs_id": str})

    # Read test orbits
    test_orbits = Orbits.from_csv(args.test_orbits)

    # Connect to Rabbit
    queue = TaskQueueConnection(
        pika.ConnectionParameters(
            host=args.rabbit_host,
            port=args.rabbit_port,
            credentials=pika.PlainCredentials(
                username=args.rabbit_username,
                password=args.rabbit_password,
            ),
        ),
        args.queue,
    )
    queue.connect()

    # Connect to GCS bucket
    gcs = GCSClient()
    if args.create_bucket:
        try:
            gcs.create_bucket(args.bucket)
        except google.cloud.exceptions.Conflict:
            # Bucket already exists.
            pass
    bucket = gcs.bucket(args.bucket)
    taskqueue_client = TaskQueueClient(bucket, queue)

    manifest = taskqueue_client.launch_job(config, preprocessed_observations,
                                           test_orbits)
    taskqueue_client.monitor_job_status(manifest.job_id)
    taskqueue_client.download_results(manifest, args.out_dir)
Example #10
def upload_blob(bucket_name: str, source_file_contents: AnyStr,
                destination_blob_name: str, gcs_storage_client: storage.Client,
                logger: logging.Logger) -> None:
    """Uploads a file to the cloud"""
    # bucket_name = "your-bucket-name"
    # source_file_contents = "... some file contents..."
    # destination_blob_name = "storage/object/name"
    bucket = gcs_storage_client.bucket(bucket_name)
    logger.info(
        f'Uploading file content to gs://{bucket_name}/{destination_blob_name}...'
    )
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_string(source_file_contents)
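A hedged usage sketch for upload_blob above; the bucket name, payload, and destination object name are placeholders.

import logging

from google.cloud import storage

client = storage.Client()
logger = logging.getLogger(__name__)
upload_blob("example-bucket", '{"status": "ok"}', "exports/status.json", client, logger)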
Example #11
def _uploadFile(credentials: dict, bucket_name: str, key: str, file_name: str) -> None:
    """
    Uploads a file to cloud storage.
    :param credentials: The Google cloud storage service credentials retrieved from the Kubernetes secret.
    :param bucket_name: The name of the bucket.
    :param key: The object key under which the file is saved in cloud storage.
    :param file_name: The local file that will be uploaded.
    """
    credentials = ServiceCredentials.from_service_account_info(credentials)
    gcs_client = StorageClient(credentials.project_id, credentials)
    bucket = gcs_client.bucket(bucket_name)
    bucket.blob(key).upload_from_filename(file_name)
    logging.info("Backup uploaded to gcs://%s/%s", bucket_name, key)
Example #12
def load_merged_weather_for_state_for_date(bucket_name: str,
                                           bucket_raw_base_path: str,
                                           bucket_merged_base_path: str,
                                           selected_state: str,
                                           start_date: datetime.date,
                                           **context):
    end_date: datetime.date = context["execution_date"]
    yyyymmdd: str = end_date.strftime("%Y%m%d")
    gcs = Client()
    bucket = gcs.bucket(bucket_name)

    # Dict like {"county": {"location": {...}, "forecast": {"yyyymmdd1": {...}, "yyyymmdd2": {...}}}}
    merged_weather = defaultdict(lambda: {"forecast": {}})

    # Read raw data for each date from first date to execution date and merge into one record
    for n in range(int((end_date - start_date).days) + 1):
        date = start_date + timedelta(n)
        state_data = read_weather_for_state_for_date(
            bucket=bucket,
            bucket_raw_base_path=bucket_raw_base_path,
            selected_state=selected_state,
            date=date)
        if state_data is None:
            print(
                f"Warning: No data for state {selected_state} for date {date}. Skipping."
            )
            continue

        for county in state_data.keys():
            print(f"Merging {county}, {selected_state} on {date}...")
            data = state_data[county]
            try:
                forecast = data["forecast"]["forecastday"][0]
                county_weather = merged_weather[county]
                county_weather["forecast"][forecast["date_epoch"]] = {
                    "maxtemp_f": forecast["day"]["maxtemp_f"],
                    "totalprecip_in": forecast["day"]["totalprecip_in"]
                }
            except Exception as e:
                print(
                    f"Skipping state {selected_state}, county {county}, date {date} due to error: {e}"
                )

    target_blob = bucket.blob(
        f"{bucket_merged_base_path.format(date=yyyymmdd)}/{selected_state}.json"
    )
    target_blob.upload_from_string(json.dumps(merged_weather, sort_keys=True))
    print(
        f"Successfully loaded merged weather data for state {selected_state} to bucket"
    )
Example #13
def load_stats_raw(
    storage_client: storage.Client,
    datacatalog: data_catalog.Client,
    region: str,
    entry_group: datacatalog_v1.EntryGroup,
    tag_template: datacatalog_v1.TagTemplate,
    app_id: str,
) -> Optional[io.BytesIO]:
    bucket_name = "ja-kakei-chousa-raw"
    bucket = storage_client.bucket(bucket_name)
    if not bucket.exists():
        bucket.create(location=region)

    entry_id = "ja_kakei_chousa_raw"
    entry = datacatalog.get_entry(entry_group, entry_id)
    if entry is None:
        entry = datacatalog_v1.types.Entry()
        entry.display_name = bucket_name
        entry.gcs_fileset_spec.file_patterns.append(
            f"gs://{bucket_name}/*.json")
        entry.type_ = datacatalog_v1.EntryType.FILESET
        entry = datacatalog.create_entry(entry_group, entry_id, entry)

    tag = datacatalog.get_tag(entry)
    if tag is None:
        tag = datacatalog_v1.types.Tag()
        tag.template = tag_template.name
        tag.fields["data_sources"] = datacatalog_v1.types.TagField()
        tag.fields[
            "data_sources"].string_value = "家計調査 https://www.e-stat.go.jp/stat-search/database?page=1&layout=datalist&toukei=00200561&tstat=000000330001&cycle=7&tclass1=000000330001&tclass2=000000330004&tclass3val=0"
        tag.fields["license"] = datacatalog_v1.types.TagField()
        tag.fields[
            "license"].string_value = "利用規約に従って複製、公衆送信、翻訳・変形等の翻案等、自由に利用できます。商用利用も可能です。 https://www.e-stat.go.jp/terms-of-use"
        tag = datacatalog.create_tag(entry, tag=tag)
    tag = datacatalog.set_status_running(tag)

    raw_dir = bucket.blob("income-divide-over-two-member-2020.json")
    res = requests.get(
        "http://api.e-stat.go.jp/rest/3.0/app/json/getStatsData?appId=%s&lang=J&statsDataId=0002070005&metaGetFlg=Y&cntGetFlg=N&explanationGetFlg=Y&annotationGetFlg=Y&sectionHeaderFlg=1"
        % app_id)
    content = None
    if res.status_code == 200:
        content = io.BytesIO(res.content)
    res.close()
    raw_dir.upload_from_file(content)
    content.seek(0)
    tag = datacatalog.set_status_completed(tag)

    return content
Example #14
def download_file_from_gcs(file_path: str, save_path: str,
                           gcs_client: storage.Client):
    """Downloads the dataset provided with the paper from Google Cloud Storage.

  Args:
    file_path: Path to the file on GCS (without the bucket) to be downloaded.
    save_path: Path where the dataset should be downloaded to. Can point to a
      /tmp path in order for it to be cleaned up automatically.
    gcs_client: GCS client.
  """
    save_dir = save_path.rsplit("/", 1)[0]
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    bucket = gcs_client.bucket(_EOSCIENCE_PUBLIC_BUCKET)
    blob = bucket.blob(file_path)
    blob.download_to_filename(save_path)
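A usage sketch, assuming the module-level _EOSCIENCE_PUBLIC_BUCKET constant is defined elsewhere in the original file; the file path and save path are placeholders.

from google.cloud import storage

client = storage.Client()
download_file_from_gcs("datasets/example/data.csv", "/tmp/eoscience/data.csv", client)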
Example #15
def _upload_one_file(client: storage.Client, src_file: str, dest_gs_url: str):
    dest_parsed_url = urlparse(dest_gs_url)
    dest_bucketname = dest_parsed_url.hostname
    dest_bucket = client.bucket(dest_bucketname)
    src_parsed_url = urlparse(src_file)

    if src_parsed_url.scheme == "":  # local file upload

        def once():
            blob = dest_bucket.blob(
                str(Path(
                    dest_parsed_url.path[1:],
                    Path(src_file).name,
                )))
            blob.upload_from_filename(src_file)

    elif src_parsed_url.scheme == "gs":  # copy between gcs buckets

        def once():
            src_bucketname = src_parsed_url.hostname
            src_bucket = client.bucket(src_bucketname)
            path = src_parsed_url.path
            src_blob = src_bucket.blob(path[1:])
            src_bucket.copy_blob(
                src_blob,
                dest_bucket,
                str(
                    Path(dest_parsed_url.path[1:]) /
                    Path(src_parsed_url.path).name),
            )

    else:
        raise ValueError(f"Cannot handle src file {src_file}")

    for attempt in range(3):
        try:
            logging.info(f"uploading file {src_file} to {dest_gs_url}")
            once()
        except GoogleCloudError as e:
            logging.warning(
                f"Copy {src_file} to {dest_gs_url} failed: attempt {attempt}",
                e)
        else:
            return

    raise RuntimeError(f"copying file {src_file} to {dest_gs_url} failed")
Example #16
def load_stats_raw(
    storage_client: storage.Client,
    datacatalog: data_catalog.Client,
    region: str,
    entry_group: datacatalog_v1.EntryGroup,
    tag_template: datacatalog_v1.TagTemplate,
    target_year: int,
    target_path: str,
) -> Optional[io.BytesIO]:
    bucket_name = "jasso-gakuseiseikatsu-stats-raw"
    bucket = storage_client.bucket(bucket_name)
    if not bucket.exists():
        bucket.create(location=region)

    entry_id = "jasso_gakuseiseikatsu_stats_raw"
    entry = datacatalog.get_entry(entry_group, entry_id)
    if entry is None:
        entry = datacatalog_v1.types.Entry()
        entry.display_name = bucket_name
        entry.gcs_fileset_spec.file_patterns.append(
            f"gs://{bucket_name}/*.xlsx")
        entry.type_ = datacatalog_v1.EntryType.FILESET
        entry = datacatalog.create_entry(entry_group, entry_id, entry)

    tag = datacatalog.get_tag(entry)
    if tag is None:
        tag = datacatalog_v1.types.Tag()
        tag.template = tag_template.name
        tag.fields["data_sources"] = datacatalog_v1.types.TagField()
        tag.fields[
            "data_sources"].string_value = "JASSO学生生活調査 https://www.jasso.go.jp/about/statistics/gakusei_chosa/index.html"
        tag.fields["license"] = datacatalog_v1.types.TagField()
        tag.fields[
            "license"].string_value = "日本学生支援機構が「学生生活調査」「高等専門学校生生活調査」「専修学校生生活調査」の結果として公開している情報は、出典の記載をしていただいた上で、どなたでも自由に利用できます。 https://www.jasso.go.jp/about/statistics/gakusei_chosa/riyou.html"
        tag = datacatalog.create_tag(entry, tag=tag)
    tag = datacatalog.set_status_running(tag)

    raw_dir = bucket.blob(f"data_{target_year}.xlsx")
    res = requests.get(target_path)
    content = None
    if res.status_code == 200:
        content = io.BytesIO(res.content)
    res.close()
    raw_dir.upload_from_file(content)
    tag = datacatalog.set_status_completed(tag)

    return content
Example #17
class ImageUploader:
    def __init__(self,
                 bucket: str,
                 account_info: Union[Client, str, Dict],
                 timeout: int = 15):
        self.account_info = account_info
        self.timeout = timeout
        self._client, self._project = ((account_info, None) if isinstance(
            account_info, Client) else self._generate_client(account_info))

        self.client = Client(credentials=self._client, project=self._project)
        self.bucket = self.client.bucket(bucket)

    def _generate_client(self, info: Union[Dict, str]) -> Tuple[Client, str]:
        if not isinstance(info, dict):
            info = json.loads(info)

        # The type key should indicate that the file is either a service account
        # credentials file or an authorized user credentials file.
        credential_type = info.get("type")
        if credential_type != _SERVICE_ACCOUNT_TYPE:
            raise ValueError(
                f'Invalid credential type "{credential_type}". Generating signed URLs requires a service account with appropriate permissions.'
            )

        from google.oauth2 import service_account

        return service_account.Credentials.from_service_account_info(
            info), info.get("project_id")

    def _get_signed_url(self, obj: str):
        return generate_download_signed_url_v4(self.bucket, obj, self.timeout)

    def signed_url_from_file(self, path: str):
        temp_file_name = str(uuid4())
        blob = self.bucket.blob(temp_file_name)
        blob.upload_from_filename(path)
        return self._get_signed_url(temp_file_name)

    def signed_url_from_string(self, string, type: str = "png"):
        temp_file_name = str(uuid4())
        blob = self.bucket.blob(temp_file_name)
        blob.upload_from_string(string, content_type=f"application/{type}")
        return self._get_signed_url(temp_file_name)
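A usage sketch for ImageUploader, assuming a local service-account key file and that the generate_download_signed_url_v4 helper referenced above is defined elsewhere; the bucket name and file path are placeholders.

import json

with open("service-account.json") as f:
    account_info = json.load(f)

uploader = ImageUploader("example-image-bucket", account_info, timeout=15)
signed_url = uploader.signed_url_from_file("/tmp/chart.png")
print(signed_url)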
Example #18
def load_raw_weather_for_state_for_date(base_url: str, api_key: str,
                                        bucket_name: str,
                                        bucket_raw_base_path: str,
                                        selected_state: str, **context):
    date: datetime.date = context["execution_date"]
    yyyymmdd: str = date.strftime("%Y%m%d")
    gcs = Client()
    bucket = gcs.bucket(bucket_name)
    blob = bucket.blob(
        f"{bucket_raw_base_path.format(date=yyyymmdd)}/{selected_state}.json.gz"
    )
    blob.upload_from_string(
        gzip_str(
            json.dumps(
                extract_raw_weather_for_state_for_date(
                    base_url=base_url,
                    api_key=api_key,
                    selected_state=selected_state,
                    date=date))))
    print(f"Successfully loaded weather data for {str(date)} to bucket")
Example #19
    def _create_connection(self, secrets=None, endpoint=None):
        if secrets is None:
            secrets = self.credentials

        if isinstance(secrets, str):
            secrets = json.loads(secrets)
        if isinstance(secrets, dict):
            secrets = service_account.Credentials.from_service_account_info(
                secrets)

        if endpoint is not None:
            raise ValueError(
                "The endpoint argument is not supported for Google Cloud Storage. Got: "
                + str(endpoint))

        client = Client(
            credentials=secrets,
            project=self.project,
        )

        return client.bucket(self.bucket, user_project=self.request_payer)
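The user_project argument above bills requests against the given project, which is what Requester Pays buckets need. A minimal sketch of reading from such a bucket; the bucket, object, and project names are placeholders.

from google.cloud.storage import Client

client = Client()
bucket = client.bucket("example-requester-pays-bucket", user_project="example-billing-project")
data = bucket.blob("path/to/object.txt").download_as_bytes()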
Example #20
class GCStorage(RemoteStorageABC):
    def __init__(self, project: str = "mathieu-tricicl", bucket_name: str = "tricicl-public"):
        self.client = Client(project=project)
        self.bucket = self.client.bucket(bucket_name)

    def upload_file(self, sync_path: SyncPath):
        blob = self.bucket.blob(str(sync_path.remote), chunk_size=10 * 1024 * 1024)
        blob.upload_from_filename(str(sync_path.local), timeout=60 * 5)

    def download_file(self, sync_path: SyncPath):
        sync_path.local.parent.mkdir(exist_ok=True, parents=True)
        blob = self.bucket.get_blob(str(sync_path.remote))
        if blob is None:
            raise FileNotFoundError(f"{sync_path.remote} is not on gcloud bucket")
        blob.download_to_filename(str(sync_path.local), timeout=60 * 5)

    def list_files(self, remote_path: PurePath, suffix: str = "") -> List[PurePath]:
        return [
            PurePath(b.name)
            for b in self.client.list_blobs(self.bucket, prefix=str(remote_path))
            if not b.name.endswith("/") and b.name.endswith(suffix)
        ]
Example #21
def create_lock_files(integrations_generation_number: dict,
                      storage_client: storage.Client,
                      integrations_details: list, test_timeout: int) -> bool:
    """
    This method tries to create a lock file for each integration specified in 'integrations_generation_number'.
    Each file contains <workflow-id>:<circle-ci-build-number>:<test-timeout>,
    where the build-number part is for debugging and troubleshooting
    and the <test-timeout> part makes it possible to unlock the files left behind by revoked test runs.
    If the lock file creation fails for any of the integrations, the already-created files are cleaned up.
    Args:
        integrations_generation_number: A dict in the form of {<integration-name>: <integration-generation>}
        storage_client: The GCP storage client
        integrations_details: List of the current test's integrations
        test_timeout: The test timeout

    Returns:
        True if all integrations were locked successfully, False otherwise.
    """
    locked_integrations = []
    bucket = storage_client.bucket(BUCKET_NAME)
    for integration, generation_number in integrations_generation_number.items(
    ):
        blob = bucket.blob(f'{LOCKS_PATH}/{integration}')
        try:
            blob.upload_from_string(
                f'{WORKFLOW_ID}:{CIRCLE_BUILD_NUM}:{test_timeout + 30}',
                if_generation_match=generation_number)
            logging_manager.debug(f'integration {integration} locked')
            locked_integrations.append(integration)
        except PreconditionFailed:
            # if this exception occurs it means that another build has locked this integration
            # before this build managed to do it.
            # we need to unlock all the integrations we have already locked and try again later
            logging_manager.warning(
                f'Could not lock integration {integration}, file creation precondition failed. '
                'Delaying test execution.')
            unlock_integrations(integrations_details, storage_client)
            return False
    return True
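A sketch of the conditional-write pattern the function above relies on: passing if_generation_match=0 makes the upload succeed only if the object does not already exist, and PreconditionFailed indicates another worker holds the lock. The bucket, blob name, and payload are placeholders.

from google.api_core.exceptions import PreconditionFailed
from google.cloud import storage

client = storage.Client()
lock_blob = client.bucket("example-locks-bucket").blob("locks/example-integration")
try:
    # Generation 0 means "the object must not exist yet".
    lock_blob.upload_from_string("workflow-1:build-123:600", if_generation_match=0)
    print("lock acquired")
except PreconditionFailed:
    print("lock already held by another build")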
Example #22
def write_to_gcs(out_data: str, out_path: str, creds_file: Optional[str]):
    from google.cloud.exceptions import Forbidden
    from google.cloud.storage import Client
    from urllib.parse import urlparse

    if creds_file is not None:
        client = Client.from_service_account_json(creds_file)
    else:
        client = Client()
    url = urlparse(out_path)
    bucket = client.bucket(url.netloc)
    # Stripping the leading /
    blob: "Blob"
    blob = bucket.blob(url.path[1:])
    try:
        blob.upload_from_string(out_data, content_type="application/json")
    except Forbidden as e:
        click.secho(
            f"Unable to write to {out_path}, permission denied:\n"
            f"{e.response.json()['error']['message']}",
            err=True,
        )
        sys.exit(1)
    blob.make_public()
Example #23
def download_blob(client: storage.Client, obj_name: str, bucket_name: str) -> bytes:
    return storage.blob.Blob(obj_name, client.bucket(bucket_name)).download_as_string()
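A usage sketch for the one-liner above; download_as_string returns bytes, so decode when text is expected. Names are placeholders.

from google.cloud import storage

client = storage.Client()
payload = download_blob(client, "reports/latest.json", "example-bucket")
text = payload.decode("utf-8")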
Example #24
class BucketClientGCS(BucketClient):
    client: Optional[GCSNativeClient]

    @property
    def client_params(self) -> Any:
        return dict(client=self.client)

    def __init__(self, **kwargs: Any) -> None:
        self.recreate(**kwargs)

    def recreate(self, **kwargs: Any) -> None:
        creds = kwargs["credentials"] if "credentials" in kwargs else None
        if creds is not None:
            kwargs["project"] = creds.project_id
        try:
            self.client = GCSNativeClient(**kwargs)
        except TypeError:
            # TypeError is raised if the imports for GCSNativeClient fail and are
            #  assigned to Any, which is not callable.
            self.client = None

    def make_uri(self, path: PurePathy) -> str:
        return str(path)

    def create_bucket(self, path: PurePathy) -> Bucket:
        assert self.client is not None, _MISSING_DEPS
        return self.client.create_bucket(path.root)

    def delete_bucket(self, path: PurePathy) -> None:
        assert self.client is not None, _MISSING_DEPS
        bucket = self.client.get_bucket(path.root)
        bucket.delete()

    def exists(self, path: PurePathy) -> bool:
        # Because we want all the parents of a valid blob (e.g. "directory" in
        # "directory/foo.file") to return True, we enumerate the blobs with a prefix
        # and compare the object names to see if they match a substring of the path
        key_name = str(path.key)
        try:
            for obj in self.list_blobs(path):
                if obj.name == key_name:
                    return True
                if obj.name.startswith(key_name + path._flavour.sep):
                    return True
        except gcs_errors.ClientError:
            return False
        return False

    def lookup_bucket(self, path: PurePathy) -> Optional[BucketGCS]:
        assert self.client is not None, _MISSING_DEPS
        try:
            native_bucket = self.client.bucket(path.root)
            if native_bucket is not None:
                return BucketGCS(str(path.root), bucket=native_bucket)
        except gcs_errors.ClientError as err:
            print(err)

        return None

    def get_bucket(self, path: PurePathy) -> BucketGCS:
        assert self.client is not None, _MISSING_DEPS
        try:
            native_bucket = self.client.bucket(path.root)
            if native_bucket is not None:
                return BucketGCS(str(path.root), bucket=native_bucket)
            raise FileNotFoundError(f"Bucket {path.root} does not exist!")
        except gcs_errors.ClientError as e:
            raise ClientError(message=e.message, code=e.code)

    def list_buckets(
        self, **kwargs: Dict[str, Any]
    ) -> Generator[GCSNativeBucket, None, None]:
        assert self.client is not None, _MISSING_DEPS
        return self.client.list_buckets(**kwargs)  # type:ignore

    def scandir(  # type:ignore[override]
        self,
        path: Optional[PurePathy] = None,
        prefix: Optional[str] = None,
        delimiter: Optional[str] = None,
    ) -> PathyScanDir:
        return _GCSScanDir(client=self, path=path, prefix=prefix, delimiter=delimiter)

    def list_blobs(
        self,
        path: PurePathy,
        prefix: Optional[str] = None,
        delimiter: Optional[str] = None,
        include_dirs: bool = False,
    ) -> Generator[BlobGCS, None, None]:
        assert self.client is not None, _MISSING_DEPS
        continuation_token = None
        bucket = self.lookup_bucket(path)
        if bucket is None:
            return
        while True:
            if continuation_token:
                response = self.client.list_blobs(
                    path.root,
                    prefix=prefix,
                    delimiter=delimiter,
                    page_token=continuation_token,
                )
            else:
                response = self.client.list_blobs(
                    path.root, prefix=prefix, delimiter=delimiter
                )
            for page in response.pages:
                for item in page:
                    yield BlobGCS(
                        bucket=bucket,
                        owner=item.owner,
                        name=item.name,
                        raw=item,
                        size=item.size,
                        updated=item.updated.timestamp(),
                    )
            if response.next_page_token is None:
                break
            continuation_token = response.next_page_token
Example #25
def load_income_stats(
    storage_client: storage.Client,
    datacatalog: data_catalog.Client,
    region: str,
    entry_group: datacatalog_v1.EntryGroup,
    tag_template: datacatalog_v1.TagTemplate,
    content: Optional[io.BytesIO],
) -> pd.DataFrame:
    bucket_name = "ja-kakei-chousa-income-divide-over-two-member"
    bucket = storage_client.bucket(bucket_name)
    if not bucket.exists():
        bucket.create(location=region)

    entry_id = "ja_kakei_chousa_income_divide_over_two_member"
    entry = datacatalog.get_entry(entry_group, entry_id)
    if entry is None:
        entry = datacatalog_v1.types.Entry()
        entry.display_name = bucket_name
        entry.gcs_fileset_spec.file_patterns.append(
            f"gs://{bucket_name}/*.parquet")
        entry.type_ = datacatalog_v1.EntryType.FILESET

        columns = []
        columns.append(
            datacatalog_v1.types.ColumnSchema(
                column="aggregatioon_category",
                type_="STRING",
                mode="REQUIRED",
                description="集計カテゴリ",
            ))

        columns.append(
            datacatalog_v1.types.ColumnSchema(
                column="houseshold_type",
                type_="STRING",
                mode="REQUIRED",
                description="世帯種別",
            ))

        columns.append(
            datacatalog_v1.types.ColumnSchema(
                column="quintile",
                type_="STRING",
                mode="REQUIRED",
                description="年間収入5分位の位置およびすべての世帯の平均のいずれか",
            ))

        columns.append(
            datacatalog_v1.types.ColumnSchema(column="value",
                                              type_="DOUBLE",
                                              mode="REQUIRED",
                                              description="各集計値"))

        entry.schema.columns.extend(columns)
        entry = datacatalog.create_entry(entry_group, entry_id, entry)

    tag = datacatalog.get_tag(entry)
    if tag is None:
        tag = datacatalog_v1.types.Tag()
        tag.template = tag_template.name
        tag.fields["data_sources"] = datacatalog_v1.types.TagField()
        tag.fields["data_sources"].string_value = "gs://ja-kakei-chousa-raw/"
        tag.fields["license"] = datacatalog_v1.types.TagField()
        tag.fields[
            "license"].string_value = "利用規約に従って複製、公衆送信、翻訳・変形等の翻案等、自由に利用できます。商用利用も可能です。 https://www.e-stat.go.jp/terms-of-use"
        tag = datacatalog.create_tag(entry, tag=tag)
    tag = datacatalog.set_status_running(tag)

    raw = json.load(content)
    classes = {}
    class_names = {}
    for c in raw["GET_STATS_DATA"]["STATISTICAL_DATA"]["CLASS_INF"][
            "CLASS_OBJ"]:
        if c["@id"] in ["tab", "area"]:
            continue
        classes[c["@id"]] = {}
        if c["@id"] == "cat01":
            class_names[c["@id"]] = "aggregation_type"
        if c["@id"] == "cat02":
            class_names[c["@id"]] = "houseshold_type"
        if c["@id"] == "cat03":
            class_names[c["@id"]] = "quintile"
        if c["@id"] == "time":
            class_names[c["@id"]] = "year"
        if not isinstance(c["CLASS"], list):
            code = c["CLASS"]
            classes[c["@id"]][code["@code"]] = code["@name"]
        else:
            for code in c["CLASS"]:
                classes[c["@id"]][code["@code"]] = code["@name"]
                if c["@id"] == "time":
                    classes[c["@id"]][code["@code"]] = int(
                        classes[c["@id"]][code["@code"]].rstrip("年"))

    def to_num(s: str) -> float:
        if s.isdecimal():
            return float(s)
        return float("nan")

    data = []

    for v in raw["GET_STATS_DATA"]["STATISTICAL_DATA"]["DATA_INF"]["VALUE"]:
        val = []
        for c, codes in classes.items():
            if "@" + c in v:
                val.append(codes[v["@" + c]])
        val.append(to_num(v["$"]))
        data.append(val)
    columns = list(map(lambda k: class_names[k], classes.keys()))
    columns.append("value")
    df = pd.DataFrame(data, columns=columns)

    content = io.BytesIO()
    df.to_parquet(content)
    content.seek(0)
    raw_dir = bucket.blob("acquisition_year=2020/data.parquet")
    raw_dir.upload_from_file(content)
    tag = datacatalog.set_status_completed(tag)
    return df
Example #26
    def get_gcp_storage_client_bucket(self) -> Bucket:
        credentials = Credentials.from_service_account_info(settings.GCP_BUCKET_CREDENTIALS)
        project_id = settings.GCP_BUCKET_CREDENTIALS.get("project_id")
        storage_client = Client(credentials=credentials, project=project_id)

        return storage_client.bucket(settings.GCP_BUCKET_NAME)
Example #27
BASE_URL = "http://api.weatherapi.com/v1/history.json?key={}&q={}+united+states&dt={}"
# Get a weatherapi.com api key
API_KEY = os.environ.get("API_WEATHER_KEY", "a70a4e2736644cdcb9d85348202404")
BUCKET_NAME = os.environ.get("BUCKET_NAME", "default")

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}

gcs = Client()
bucket = gcs.bucket(BUCKET_NAME)
state_file = bucket.get_blob("states_counties.json")
STATES = json.loads(state_file.download_as_string())


def slugify_state(state):
    return "-".join(state.split())


def get_weather_data(query):
    weather = {}
    weather["forecast"] = {}
    date = datetime.today()
    full_url = BASE_URL.format(API_KEY, query, date.strftime('%Y-%m-%d'))
    response = requests.get(full_url)
    data = response.json()
Example #28
class DssUploader:
    def __init__(self, dss_endpoint: str, staging_bucket: str,
                 google_project_id: str, dry_run: bool) -> None:
        """
        Functions for uploading files to a given DSS.

        :param dss_endpoint: The URL to a Swagger DSS API.  e.g. "https://commons-dss.ucsc-cgp-dev.org/v1"
        :param staging_bucket: The name of the AWS S3 bucket to be used when staging files for uploading
        to the DSS. As an example, local files are uploaded to the staging bucket, then file metadata tags
        required by the DSS are assigned to it, then the file is loaded into the DSS (by copy).
        The bucket must be accessible by the DSS, e.g. 'commons-dss-upload'.
        :param google_project_id: A Google `Project ID` to be used when accessing GCP requester pays buckets.
        e.g. "platform-dev-178517"
        One way to find a `Project ID` is provided here:
        https://console.cloud.google.com/cloud-resource-manager
        :param dry_run: If True, log the actions that would be performed yet don't actually execute them.
        Otherwise, actually perform the operations.
        """
        self.dss_endpoint = dss_endpoint
        self.staging_bucket = staging_bucket
        self.google_project_id = google_project_id
        self.dry_run = dry_run
        self.s3_client = boto3.client("s3")
        self.s3_blobstore = s3.S3BlobStore(self.s3_client)
        self.gs_client = Client()

        # Work around problems with DSSClient initialization when there is
        # existing HCA configuration. The following issue has been submitted:
        # Problems accessing an alternate DSS from user scripts or unit tests #170
        # https://github.com/HumanCellAtlas/dcp-cli/issues/170
        monkey_patch_hca_config()
        HCAConfig._user_config_home = '/tmp/'
        dss_config = HCAConfig(name='loader',
                               save_on_exit=False,
                               autosave=False)
        dss_config[
            'DSSClient'].swagger_url = f'{self.dss_endpoint}/swagger.json'
        self.dss_client = DSSClient(config=dss_config)

    def upload_cloud_file_by_reference(self,
                                       filename: str,
                                       file_uuid: str,
                                       file_cloud_urls: set,
                                       bundle_uuid: str,
                                       guid: str,
                                       file_version: str = None) -> tuple:
        """
        Loads the given cloud file into the DSS by reference, rather than by copying it into the DSS.
        Because the HCA DSS per se does not support loading by reference, this is currently implemented
        using the approach described here:
        https://docs.google.com/document/d/1QSa7Ubw-muyD_u0X_dq9WeKyK_dCJXi4Ex7S_pil1uk/edit#heading=h.exnqjy2n2q78

        This is conceptually similar to creating a "symbolic link" to the cloud file rather than copying the
        source file into the DSS.
        The file's metadata is obtained, formatted as a dictionary, then this dictionary is uploaded
        as a JSON file with content type `dss-type=fileref` into the DSS.

        A request has been made for the HCA data-store to support loading by reference as a feature of the
        data store, here: https://github.com/HumanCellAtlas/data-store/issues/912

        :param filename: The name of the file in the bucket.
        :param file_uuid: An RFC4122-compliant UUID to be used to identify the file
        :param file_cloud_urls: A set of 'gs://' and 's3://' bucket links.
                                e.g. {'gs://broad-public-datasets/g.bam', 's3://ucsc-topmed-datasets/a.bam'}
        :param bundle_uuid: An RFC4122-compliant UUID to be used to identify the bundle containing the file
        :param guid: An optional additional/alternate data identifier/alias to associate with the file
        e.g. "dg.4503/887388d7-a974-4259-86af-f5305172363d"
        :param file_version: a RFC3339 compliant datetime string
        :return: file_uuid: str, file_version: str, filename: str, already_present: bool
        """
        def _create_file_reference(file_cloud_urls: set, guid: str) -> dict:
            """
            Format a file's metadata into a dictionary for uploading as a json to support the approach
            described here:
            https://docs.google.com/document/d/1QSa7Ubw-muyD_u0X_dq9WeKyK_dCJXi4Ex7S_pil1uk/edit#heading=h.exnqjy2n2q78

            :param file_cloud_urls: A set of 'gs://' and 's3://' bucket links.
                                    e.g. {'gs://broad-public-datasets/g.bam', 's3://ucsc-topmed-datasets/a.bam'}
            :param guid: An optional additional/alternate data identifier/alias to associate with the file
            e.g. "dg.4503/887388d7-a974-4259-86af-f5305172363d"
            :param file_version: RFC3339 formatted timestamp.
            :return: A dictionary of metadata values.
            """
            s3_metadata = None
            gs_metadata = None
            for cloud_url in file_cloud_urls:
                url = urlparse(cloud_url)
                bucket = url.netloc
                key = url.path[1:]
                if url.scheme == "s3":
                    s3_metadata = _get_s3_file_metadata(bucket, key)
                elif url.scheme == "gs":
                    gs_metadata = _get_gs_file_metadata(bucket, key)
                else:
                    raise FileURLError(
                        f"Unsupported cloud URL scheme: {cloud_url}")
            return _consolidate_metadata(file_cloud_urls, s3_metadata,
                                         gs_metadata, guid)

        def _get_s3_file_metadata(bucket: str, key: str) -> dict:
            """
            Format an S3 file's metadata into a dictionary for uploading as a json.

            :param bucket: Name of an S3 bucket
            :param key: S3 file to upload.  e.g. 'output.txt' or 'data/output.txt'
            :return: A dictionary of metadata values.
            """
            metadata = dict()
            try:
                response = self.s3_client.head_object(Bucket=bucket,
                                                      Key=key,
                                                      RequestPayer="requester")
                metadata['content-type'] = response['ContentType']
                metadata['s3_etag'] = response['ETag']
                metadata['size'] = response['ContentLength']
            except Exception as e:
                raise FileURLError(
                    f"Error accessing s3://{bucket}/{key}") from e
            return metadata

        def _get_gs_file_metadata(bucket: str, key: str) -> dict:
            """
            Format a GS file's metadata into a dictionary for uploading as a JSON file.

            :param bucket: Name of a GS bucket.
            :param key: GS file to upload.  e.g. 'output.txt' or 'data/output.txt'
            :return: A dictionary of metadata values.
            """
            metadata = dict()
            try:
                gs_bucket = self.gs_client.bucket(bucket,
                                                  self.google_project_id)
                blob_obj = gs_bucket.get_blob(key)
                metadata['content-type'] = blob_obj.content_type
                metadata['crc32c'] = binascii.hexlify(
                    base64.b64decode(blob_obj.crc32c)).decode("utf-8").lower()
                metadata['size'] = blob_obj.size
            except Exception as e:
                raise FileURLError(
                    f"Error accessing gs://{bucket}/{key}") from e
            return metadata

        def _consolidate_metadata(file_cloud_urls: set,
                                  s3_metadata: Optional[Dict[str, Any]],
                                  gs_metadata: Optional[Dict[str, Any]],
                                  guid: str) -> dict:
            """
            Consolidates cloud file metadata to create the JSON used to load by reference
            into the DSS.

            :param file_cloud_urls: A set of 'gs://' and 's3://' bucket URLs.
                                    e.g. {'gs://broad-public-datasets/g.bam', 's3://ucsc-topmed-datasets/a.bam'}
            :param s3_metadata: Dictionary of meta data produced by _get_s3_file_metadata().
            :param gs_metadata: Dictionary of meta data produced by _get_gs_file_metadata().
            :param guid: An optional additional/alternate data identifier/alias to associate with the file
            e.g. "dg.4503/887388d7-a974-4259-86af-f5305172363d"
            :return: A dictionary of cloud file metadata values
            """
            consolidated_metadata = dict()
            if s3_metadata:
                consolidated_metadata.update(s3_metadata)
            if gs_metadata:
                consolidated_metadata.update(gs_metadata)
            consolidated_metadata['url'] = list(file_cloud_urls)
            consolidated_metadata['aliases'] = [str(guid)]
            return consolidated_metadata

        if self.dry_run:
            logger.info(
                f"DRY RUN: upload_cloud_file_by_reference: {filename} {str(file_cloud_urls)} {bundle_uuid}"
            )

        file_reference = _create_file_reference(file_cloud_urls, guid)
        return self.upload_dict_as_file(
            file_reference,
            filename,
            file_uuid,
            bundle_uuid,
            file_version=file_version,
            content_type="application/json; dss-type=fileref")

    def upload_dict_as_file(
            self,
            value: dict,
            filename: str,
            file_uuid: str,
            bundle_uuid: str,
            file_version: str = None,  # RFC3339
            content_type=None):
        """
        Create a JSON file in the DSS containing the given dict.

        :param value: A dictionary representing the JSON content of the file to be created.
        :param filename: The basename of the file in the bucket.
        :param file_uuid: An RFC4122-compliant UUID to be used to identify the file
        :param bundle_uuid: An RFC4122-compliant UUID to be used to identify the bundle containing the file
        :param content_type: Content description e.g. "application/json; dss-type=fileref".
        :param file_version: a RFC3339 compliant datetime string
        :return: file_uuid: str, file_version: str, filename: str, already_present: bool
        """
        tempdir = mkdtemp()
        file_path = "/".join([tempdir, filename])
        with open(file_path, "w") as fh:
            fh.write(json.dumps(value, indent=4))
        result = self.upload_local_file(file_path,
                                        file_uuid,
                                        bundle_uuid,
                                        file_version=file_version,
                                        content_type=content_type)
        os.remove(file_path)
        os.rmdir(tempdir)
        return result

    def upload_local_file(self,
                          path: str,
                          file_uuid: str,
                          bundle_uuid: str,
                          file_version: str = None,
                          content_type=None):
        """
        Upload a file from the local file system to the DSS.

        :param path: Path to a local file.
        :param file_uuid: An RFC4122-compliant UUID to be used to identify the file
        :param bundle_uuid: An RFC4122-compliant UUID to be used to identify the bundle containing the file
        :param content_type: Content type identifier, for example: "application/json; dss-type=fileref".
        :param file_version: a RFC3339 compliant datetime string
        :return: file_uuid: str, file_version: str, filename: str, already_present: bool
        """
        file_uuid, key = self._upload_local_file_to_staging(
            path, file_uuid, content_type)
        return self._upload_tagged_cloud_file_to_dss_by_copy(
            self.staging_bucket,
            key,
            file_uuid,
            bundle_uuid,
            file_version=file_version)

    def load_bundle(self, file_info_list: list, bundle_uuid: str):
        """
        Loads a bundle to the DSS that contains the specified files.

        :param file_info_list:
        :param bundle_uuid: An RFC4122-compliant UUID to be used to identify the bundle containing the file
        :return: A full qualified bundle id e.g. "{bundle_uuid}.{version}"
        """
        kwargs = dict(replica="aws",
                      creator_uid=CREATOR_ID,
                      files=file_info_list,
                      uuid=bundle_uuid,
                      version=tz_utc_now())
        if not self.dry_run:
            response = self.dss_client.put_bundle(**kwargs)
            version = response['version']
        else:
            logger.info("DRY RUN: DSS put bundle: " + str(kwargs))
            version = None
        bundle_fqid = f"{bundle_uuid}.{version}"
        logger.info(f"Loaded bundle: {bundle_fqid}")
        return bundle_fqid

    @staticmethod
    def get_filename_from_key(key: str):
        assert not key.endswith(
            '/'
        ), 'Please specify a filename, not a directory ({} cannot end in "/").'.format(
            key)
        return key.split("/")[-1]

    def _upload_local_file_to_staging(self, path: str, file_uuid: str,
                                      content_type):
        """
        Upload a local file to the staging bucket, computing the DSS-required checksums
        in the process, then tag the file in the staging bucket with the checksums.
        This is in preparation for subsequently uploading the file from the staging
        bucket into the DSS.

        :param path: Path to a local file.
        :param file_uuid: An RFC4122-compliant UUID to be used to identify the file.
        :param content_type: Content description, for example: "application/json; dss-type=fileref".
        :return: file_uuid: str, key_name: str
        """
        def _encode_tags(tags):
            return [dict(Key=k, Value=v) for k, v in tags.items()]

        def _mime_type(filename):
            type_, encoding = mimetypes.guess_type(filename)
            if encoding:
                return encoding
            if type_:
                return type_
            return "application/octet-stream"

        file_size = os.path.getsize(path)
        multipart_chunksize = s3_multipart.get_s3_multipart_chunk_size(
            file_size)
        tx_cfg = TransferConfig(
            multipart_threshold=s3_multipart.MULTIPART_THRESHOLD,
            multipart_chunksize=multipart_chunksize)
        s3 = boto3.resource("s3")

        destination_bucket = s3.Bucket(self.staging_bucket)
        with open(path, "rb") as file_handle, ChecksummingBufferedReader(
                file_handle, multipart_chunksize) as fh:
            key_name = "{}/{}".format(file_uuid, os.path.basename(fh.raw.name))
            destination_bucket.upload_fileobj(
                fh,
                key_name,
                Config=tx_cfg,
                ExtraArgs={
                    'ContentType':
                    content_type
                    if content_type is not None else _mime_type(fh.raw.name)
                })
            sums = fh.get_checksums()
            metadata = {
                "hca-dss-s3_etag": sums["s3_etag"],
                "hca-dss-sha1": sums["sha1"],
                "hca-dss-sha256": sums["sha256"],
                "hca-dss-crc32c": sums["crc32c"],
            }

            s3.meta.client.put_object_tagging(
                Bucket=destination_bucket.name,
                Key=key_name,
                Tagging=dict(TagSet=_encode_tags(metadata)))
        return file_uuid, key_name

    def _upload_tagged_cloud_file_to_dss_by_copy(self,
                                                 source_bucket: str,
                                                 source_key: str,
                                                 file_uuid: str,
                                                 bundle_uuid: str,
                                                 file_version: str = None,
                                                 timeout_seconds=1200):
        """
        Uploads a tagged file contained in a cloud bucket to the DSS by copy.
        This is typically used to upload a tagged file from a staging bucket into the DSS.

        :param source_bucket: Name of an S3 bucket.  e.g. 'commons-dss-upload'
        :param source_key: S3 file to upload.  e.g. 'output.txt' or 'data/output.txt'
        :param file_uuid: An RFC4122-compliant UUID to be used to identify the file.
        :param bundle_uuid: An RFC4122-compliant UUID to be used to identify the bundle containing the file
        :param file_version: a RFC3339 compliant datetime string
        :param timeout_seconds:  Amount of time to continue attempting an async copy.
        :return: file_uuid: str, file_version: str, filename: str, file_present: bool
        """
        source_url = f"s3://{source_bucket}/{source_key}"
        filename = self.get_filename_from_key(source_key)

        if self.dry_run:
            logger.info(
                f"DRY RUN: _upload_tagged_cloud_file_to_dss: {source_bucket} {source_key} {file_uuid} {bundle_uuid}"
            )
            return file_uuid, file_version, filename

        request_parameters = dict(uuid=file_uuid,
                                  version=file_version,
                                  bundle_uuid=bundle_uuid,
                                  creator_uid=CREATOR_ID,
                                  source_url=source_url)
        if self.dry_run:
            print("DRY RUN: put file: " + str(request_parameters))
            return file_uuid, file_version, filename

        copy_start_time = time.time()
        response = self.dss_client.put_file._request(request_parameters)

        # The version we get back here is formatted the way the DSS expects,
        # and we need it in this format later when loading the bundle.
        file_version = response.json().get('version', "blank")

        # from dss swagger docs:
        # 200 Returned when the file is already present and is identical to the file being uploaded.
        already_present = response.status_code == requests.codes.ok
        if response.status_code == requests.codes.ok:
            logger.info("File %s: Already exists -> %s (%d seconds)",
                        source_url, file_version,
                        (time.time() - copy_start_time))
        elif response.status_code == requests.codes.created:
            logger.info("File %s: Sync copy -> %s (%d seconds)", source_url,
                        file_version, (time.time() - copy_start_time))
        elif response.status_code == requests.codes.accepted:
            logger.info("File %s: Starting async copy -> %s", source_url,
                        file_version)

            timeout = time.time() + timeout_seconds
            wait = 1.0
            # TODO: busy wait could hopefully be replaced with asyncio
            while time.time() < timeout:
                try:
                    self.dss_client.head_file(uuid=file_uuid,
                                              replica="aws",
                                              version=file_version)
                    logger.info(
                        "File %s: Finished async copy -> %s (approximately %d seconds)",
                        source_url, file_version,
                        (time.time() - copy_start_time))
                    break
                except SwaggerAPIException as e:
                    if e.code != requests.codes.not_found:
                        msg = "File {}: Unexpected server response during registration"
                        raise RuntimeError(msg.format(source_url))
                    time.sleep(wait)
                    wait = min(10.0,
                               wait * self.dss_client.UPLOAD_BACKOFF_FACTOR)
            else:
                # timed out. :(
                raise RuntimeError(
                    "File {}: registration FAILED".format(source_url))
            logger.debug("Successfully uploaded file")
        else:
            raise UnexpectedResponseError(
                f'Received unexpected response code {response.status_code}')

        return file_uuid, file_version, filename, already_present
Example #29
class BucketClientGCS(BucketClient):
    client: Optional[GCSNativeClient]

    def __init__(self, client: Optional[GCSNativeClient] = None):
        try:
            self.client = GCSNativeClient() if GCSNativeClient else None
        except (BaseException, DefaultCredentialsError):
            self.client = None

    def make_uri(self, path: PurePathy) -> str:
        return str(path)

    def create_bucket(self, path: PurePathy) -> Bucket:
        assert self.client is not None, _MISSING_DEPS
        return self.client.create_bucket(path.root)

    def delete_bucket(self, path: PurePathy) -> None:
        assert self.client is not None, _MISSING_DEPS
        bucket = self.client.get_bucket(path.root)
        bucket.delete()

    def exists(self, path: PurePathy) -> bool:
        # Because we want all the parents of a valid blob (e.g. "directory" in
        # "directory/foo.file") to return True, we enumerate the blobs with a prefix
        # and compare the object names to see if they match a substring of the path
        key_name = str(path.key)
        try:
            for obj in self.list_blobs(path):
                if obj.name == key_name:
                    return True
                if obj.name.startswith(key_name + path._flavour.sep):
                    return True
        except gcs_errors.ClientError:
            return False
        return False

    def lookup_bucket(self, path: PurePathy) -> Optional[BucketGCS]:
        assert self.client is not None, _MISSING_DEPS
        try:
            native_bucket = self.client.bucket(path.root)
            if native_bucket is not None:
                return BucketGCS(str(path.root), bucket=native_bucket)
        except gcs_errors.ClientError as err:
            print(err)

        return None

    def get_bucket(self, path: PurePathy) -> BucketGCS:
        assert self.client is not None, _MISSING_DEPS
        try:
            native_bucket = self.client.bucket(path.root)
            if native_bucket is not None:
                return BucketGCS(str(path.root), bucket=native_bucket)
            raise FileNotFoundError(f"Bucket {path.root} does not exist!")
        except gcs_errors.ClientError as e:
            raise ClientError(message=e.message, code=e.code)

    def list_buckets(
            self,
            **kwargs: Dict[str,
                           Any]) -> Generator[GCSNativeBucket, None, None]:
        assert self.client is not None, _MISSING_DEPS
        return self.client.list_buckets(**kwargs)  # type:ignore

    def scandir(  # type:ignore[override]
        self,
        path: Optional[PurePathy] = None,
        prefix: Optional[str] = None,
        delimiter: Optional[str] = None,
    ) -> Generator[BucketEntryGCS, None, None]:  # type:ignore[override]
        assert self.client is not None, _MISSING_DEPS
        continuation_token = None
        if path is None or not path.root:
            gcs_bucket: GCSNativeBucket
            for gcs_bucket in self.list_buckets():
                yield BucketEntryGCS(gcs_bucket.name, is_dir=True, raw=None)
            return
        sep = path._flavour.sep
        bucket = self.lookup_bucket(path)
        if bucket is None:
            return
        while True:
            if continuation_token:
                response = self.client.list_blobs(
                    bucket.name,
                    prefix=prefix,
                    delimiter=sep,
                    page_token=continuation_token,
                )
            else:
                response = self.client.list_blobs(bucket.name,
                                                  prefix=prefix,
                                                  delimiter=sep)
            for page in response.pages:
                for folder in list(page.prefixes):
                    full_name = folder[:-1] if folder.endswith(sep) else folder
                    name = full_name.split(sep)[-1]
                    if name:
                        yield BucketEntryGCS(name, is_dir=True, raw=None)
                for item in page:
                    name = item.name.split(sep)[-1]
                    if name:
                        yield BucketEntryGCS(
                            name=name,
                            is_dir=False,
                            size=item.size,
                            last_modified=item.updated.timestamp(),
                            raw=item,
                        )
            if response.next_page_token is None:
                break
            continuation_token = response.next_page_token

    def list_blobs(
        self,
        path: PurePathy,
        prefix: Optional[str] = None,
        delimiter: Optional[str] = None,
        include_dirs: bool = False,
    ) -> Generator[BlobGCS, None, None]:
        assert self.client is not None, _MISSING_DEPS
        continuation_token = None
        bucket = self.lookup_bucket(path)
        if bucket is None:
            return
        while True:
            if continuation_token:
                response = self.client.list_blobs(
                    path.root,
                    prefix=prefix,
                    delimiter=delimiter,
                    page_token=continuation_token,
                )
            else:
                response = self.client.list_blobs(path.root,
                                                  prefix=prefix,
                                                  delimiter=delimiter)
            for page in response.pages:
                for item in page:
                    yield BlobGCS(
                        bucket=bucket,
                        owner=item.owner,
                        name=item.name,
                        raw=item,
                        size=item.size,
                        updated=item.updated.timestamp(),
                    )
            if response.next_page_token is None:
                break
            continuation_token = response.next_page_token
Example #30
    def _upload_to_cloudstorage(self, local_path: str, blob_path: str) -> str:
        client = Client()
        bucket = client.bucket(self.BUCKET)
        blob = bucket.blob(blob_path)
        blob.upload_from_filename(local_path, timeout=3600)
        return blob_path