Example #1
    def _copy_to_ingest_bucket(
        self,
        path: str,
        full_file_upload_path: GcsfsFilePath,
    ) -> None:
        """Moves a file within GCS to the appropriate bucket if it has not already been deemed
        processed or discovered by the file metadata manager.

        We check both processed and discovered because a file may be discovered and awaiting to be
        ingested, so we will not re-upload. We check processed because a file may have already been
        ingested, but has been deleted from the bucket."""
        if not self.postgres_direct_ingest_file_metadata_manager.has_raw_file_been_discovered(
                full_file_upload_path
        ) and not self.postgres_direct_ingest_file_metadata_manager.has_raw_file_been_processed(
                full_file_upload_path):
            try:
                mimetype, _ = guess_type(os.path.basename(path))
                self.gcsfs.mv(
                    src_path=GcsfsFilePath.from_absolute_path(path),
                    dst_path=full_file_upload_path,
                )
                self.gcsfs.set_content_type(
                    full_file_upload_path,
                    mimetype if mimetype else "text/plain")
                logging.info("Copied %s -> %s", path,
                             full_file_upload_path.uri())
                self.uploaded_files.append(path)
            except BaseException as e:
                logging.warning(
                    "Could not copy %s -> %s due to error %s",
                    path,
                    full_file_upload_path.uri(),
                    e.args,
                )
                self.unable_to_upload_files.append(path)
        else:
            logging.info(
                "Skipping copy of %s -> %s: file has already been discovered or processed",
                path,
                full_file_upload_path.uri(),
            )
            self.skipped_files.append(path)
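The fallback to "text/plain" above matters because `guess_type` returns `(None, None)` when it does not recognize the extension. A quick sketch of the behavior this snippet relies on (file names are illustrative):
from mimetypes import guess_type

# guess_type infers a MIME type from the file extension alone.
assert guess_type("us_xx_raw_data.csv")[0] == "text/csv"
assert guess_type("notes.txt")[0] == "text/plain"
# Unknown or missing extensions yield None, hence the "text/plain" fallback.
assert guess_type("file_without_extension")[0] is None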
Example #2
    def _copy_to_ingest_bucket(
        self,
        path: str,
        full_file_upload_path: GcsfsFilePath,
    ) -> None:
        """Copies the file at `path` to the ingest bucket via gsutil, or only records the
        planned copy when this is a dry run."""
        if not self.dry_run:
            try:
                gsutil_cp(path, full_file_upload_path.uri())
                self.uploaded_files.append(path)
                self.copies_list.append((path, full_file_upload_path.uri()))
            except ValueError:
                self.unable_to_upload_files.append(path)
        else:
            self.copies_list.append((path, full_file_upload_path.uri()))

        with self.mutex:
            if self.move_progress:
                # pylint: disable=not-callable
                self.move_progress.next()
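A minimal sketch of the dry-run pattern this variant follows; the `DryRunCopier` class below is hypothetical and stands in for the surrounding controller:
from typing import List, Tuple

class DryRunCopier:
    """Records every planned copy; performs it only when dry_run is False."""

    def __init__(self, dry_run: bool) -> None:
        self.dry_run = dry_run
        self.copies_list: List[Tuple[str, str]] = []

    def copy(self, src: str, dst: str) -> None:
        if not self.dry_run:
            pass  # perform the real copy here, e.g. via gsutil_cp
        # Record the pair either way so a dry run can be audited afterwards.
        self.copies_list.append((src, dst))

copier = DryRunCopier(dry_run=True)
copier.copy("raw/file.csv", "gs://ingest-bucket/file.csv")
print(copier.copies_list)  # [('raw/file.csv', 'gs://ingest-bucket/file.csv')]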
Example #3
    def _copy_to_ingest_bucket(self, path: str,
                               full_file_upload_path: GcsfsFilePath) -> None:
        try:
            mimetype, _ = guess_type(os.path.basename(path))
            self.gcsfs.mv(
                src_path=GcsfsFilePath.from_absolute_path(path),
                dst_path=full_file_upload_path,
            )
            self.gcsfs.set_content_type(full_file_upload_path,
                                        mimetype if mimetype else "text/plain")
            logging.info("Copied %s -> %s", path, full_file_upload_path.uri())
            self.uploaded_files.append(path)
        except BaseException as e:
            logging.warning(
                "Could not copy %s -> %s due to error %s",
                path,
                full_file_upload_path.uri(),
                e.args,
            )
            self.unable_to_upload_files.append(path)
Example #4
    def _file_pointer_for_path(self, path: GcsfsFilePath, encoding: str):
        """Returns a file pointer for the given path."""

        # From the GCSFileSystem docs (https://gcsfs.readthedocs.io/en/latest/api.html#gcsfs.core.GCSFileSystem),
        # 'google_default' means we should look for local credentials set up via `gcloud login`. The project this is
        # reading from may have to match the project default you have set locally (check via `gcloud info` and set via
        # `gcloud config set project [PROJECT_ID]`). If we are running in the GAE environment, we should be able to query
        # the internal metadata for credentials.
        token = 'google_default' if not environment.in_gae() else 'cloud'
        return self.gcs_file_system.open(path.uri(),
                                         encoding=encoding,
                                         token=token)
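For reference, the two token modes the comment describes map onto gcsfs roughly like this (bucket and project names are placeholders; this sketch passes `token` to the `GCSFileSystem` constructor, which is where gcsfs documents it):
import gcsfs

# 'google_default' reads the local credentials created by `gcloud auth login`;
# 'cloud' asks the GCE/GAE metadata server for service-account credentials.
fs = gcsfs.GCSFileSystem(project="my-project", token="google_default")
with fs.open("gs://my-bucket/raw/file.csv", "r", encoding="utf-8") as f:
    print(f.readline())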
Example #5
    def open(
        self,
        path: GcsfsFilePath,
        chunk_size: Optional[int] = None,
        encoding: Optional[str] = None,
    ) -> Iterator[TextIO]:
        blob = self._get_blob(path)
        with blob.open("rb", chunk_size=chunk_size) as f:
            verifiable_reader = VerifiableBytesReader(f, name=path.uri())
            try:
                yield TextIOWrapper(buffer=verifiable_reader,
                                    encoding=encoding)
            finally:
                verifiable_reader.verify_crc32c(blob.crc32c)
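Because the body yields exactly once, this `open` is written to be used as a context manager (presumably wired up via `@contextmanager` in the surrounding class, which the excerpt does not show). A hypothetical call site:
# `fs` is an instance of the class above; `process` is a placeholder.
with fs.open(GcsfsFilePath.from_absolute_path("gs://bucket/data.csv"),
             encoding="utf-8") as f:
    for line in f:
        process(line)
# On exit, VerifiableBytesReader compares the CRC32C of the bytes actually
# read against the checksum stored on the blob.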
Example #6
    def _get_blob(self, path: GcsfsFilePath) -> storage.Blob:
        try:
            bucket = self.storage_client.bucket(path.bucket_name)
            blob = bucket.get_blob(path.blob_name)
        except NotFound as error:
            logging.warning(
                "Blob at [%s] does not exist - might have already been deleted",
                path.uri(),
            )

            raise GCSBlobDoesNotExistError(
                f"Blob at [{path.uri()}] does not exist") from error
        else:
            if not blob:
                logging.warning(
                    "Blob at [%s] does not exist - might have already been deleted",
                    path.uri(),
                )

                raise GCSBlobDoesNotExistError(
                    f"Blob at [{path.uri()}] does not exist")

            return blob
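A hypothetical caller can use the raised `GCSBlobDoesNotExistError` to treat a missing blob as an expected case rather than a crash:
try:
    blob = fs._get_blob(GcsfsFilePath.from_absolute_path("gs://bucket/maybe.csv"))
except GCSBlobDoesNotExistError:
    blob = None  # the file was already deleted; nothing to read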
Example #7
    def cache_ingest_file(self,
                          path: GcsfsFilePath,
                          csv_text: str,
                          separator: str = ",") -> None:
        self.fs.upload_from_string(path, csv_text, content_type="text/csv")
        response = self.test_client.post(
            "/data_discovery/cache_ingest_file_as_parquet_task",
            json={
                "gcs_file_uri": path.uri(),
                "file_encoding": "UTF-8",
                "file_separator": separator,
                "file_quoting": csv.QUOTE_MINIMAL,
            },
        )
        self.assertEqual(HTTPStatus.CREATED, response.status_code)
Example #8
def build_cache_ingest_file_as_parquet_task(
    gcs_file: GcsfsFilePath,
    separator: str,
    encoding: str,
    quoting: int,
    custom_line_terminator: Optional[str],
) -> Dict[str, Any]:
    body = {
        "gcs_file_uri": gcs_file.uri(),
        "file_separator": separator,
        "file_encoding": encoding,
        "file_quoting": quoting,
    }
    # Only include the line terminator when one was actually supplied.
    if custom_line_terminator:
        body["file_custom_line_terminator"] = custom_line_terminator

    return {
        "relative_uri": "/admin/data_discovery/cache_ingest_file_as_parquet_task",
        "body": body,
    }
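Since `file_custom_line_terminator` is only added when a terminator is supplied, a call with placeholder arguments produces:
task = build_cache_ingest_file_as_parquet_task(
    gcs_file=GcsfsFilePath.from_absolute_path("gs://bucket/file.csv"),
    separator=",",
    encoding="UTF-8",
    quoting=csv.QUOTE_MINIMAL,  # == 0
    custom_line_terminator=None,
)
# task == {
#     "relative_uri": "/admin/data_discovery/cache_ingest_file_as_parquet_task",
#     "body": {
#         "gcs_file_uri": "gs://bucket/file.csv",
#         "file_separator": ",",
#         "file_encoding": "UTF-8",
#         "file_quoting": 0,
#     },
# }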