Example No. 1
def get_importable_csvs(
    override_project_id: Optional[str] = None,
) -> Dict[str, GcsfsFilePath]:
    """Returns a map from view ids to GcsfsFilePaths where the views have
    been exported to CSVs."""
    gcsfs = GcsfsFactory.build()
    project_id = (
        metadata.project_id() if not override_project_id else override_project_id
    )

    files_in_import_folder = [
        f
        for f in gcsfs.ls_with_blob_prefix(
            bucket_name=f"{project_id}-case-triage-data",
            blob_prefix="to_import",
        )
        if isinstance(f, GcsfsFilePath)
    ]

    importable_csvs = {}

    for f in files_in_import_folder:
        root, ext = os.path.splitext(f.file_name)
        if ext != ".csv":
            continue
        importable_csvs[root] = f

    return importable_csvs
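
A minimal usage sketch for the function above (the call site and printed fields are illustrative, not from the original source):

# Hypothetical caller; assumes get_importable_csvs is importable from its defining module.
importable = get_importable_csvs()
for view_id, csv_path in importable.items():
    print(view_id, csv_path.uri())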
Example No. 2
    def __init__(
        self,
        project_id: str,
        region: str,
        lower_bound_update_datetime: Optional[datetime.datetime],
        gcs_destination_path: Optional[str] = None,
    ):
        self.project_id = project_id
        self.region = region.lower()

        self.auth = SftpAuth.for_region(region)
        self.delegate = SftpDownloadDelegateFactory.build(region_code=region)
        self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())

        self.unable_to_download_items: List[str] = []
        self.downloaded_items: List[Tuple[str, datetime.datetime]] = []

        self.lower_bound_update_datetime = lower_bound_update_datetime
        self.bucket = (
            GcsfsDirectoryPath.from_absolute_path(
                gcsfs_direct_ingest_directory_path_for_region(
                    region, SystemLevel.STATE, project_id=self.project_id
                )
            )
            if gcs_destination_path is None
            else GcsfsDirectoryPath.from_absolute_path(gcs_destination_path)
        )
        self.download_dir = GcsfsDirectoryPath.from_dir_and_subdir(
            dir_path=self.bucket, subdir=RAW_INGEST_DIRECTORY
        )
Example No. 3
    def __init__(
        self,
        paths_with_timestamps: List[Tuple[str, datetime.datetime]],
        project_id: str,
        region: str,
        delegate: UploadStateFilesToIngestBucketDelegate,
        destination_bucket_override: Optional[GcsfsBucketPath] = None,
    ):
        self.paths_with_timestamps = paths_with_timestamps
        self.project_id = project_id
        self.region = region.lower()
        self.delegate = delegate

        self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())

        # Raw data uploads always default to primary ingest bucket
        self.destination_ingest_bucket = (
            destination_bucket_override
            or gcsfs_direct_ingest_bucket_for_region(
                region_code=region,
                system_level=SystemLevel.STATE,
                ingest_instance=DirectIngestInstance.PRIMARY,
                project_id=self.project_id,
            ))

        self.uploaded_files: List[str] = []
        self.skipped_files: List[str] = []
        self.unable_to_upload_files: List[str] = []
Example No. 4
def read_batch_metadata(*, batch_id: str, state_code: StateCode) -> Dict[str, str]:
    gcsfs = GcsfsFactory.build()
    return json.loads(
        gcsfs.download_as_string(
            path=_gcsfs_path_for_batch_metadata(batch_id, state_code)
        )
    )
Example No. 5
def normalize_raw_file_path() -> Tuple[str, HTTPStatus]:
    """Called from a Cloud Function when a new file is added to a bucket that is configured to rename files but not
    ingest them. For example, a bucket that is being used for automatic data transfer testing.
    """
    # The bucket name for the file to normalize
    bucket = get_str_param_value("bucket", request.args)
    # The relative path to the file, not including the bucket name
    relative_file_path = get_str_param_value("relative_file_path",
                                             request.args,
                                             preserve_case=True)

    if not bucket or not relative_file_path:
        return f"Bad parameters [{request.args}]", HTTPStatus.BAD_REQUEST

    path = GcsfsPath.from_bucket_and_blob_name(bucket_name=bucket,
                                               blob_name=relative_file_path)

    if not isinstance(path, GcsfsFilePath):
        raise ValueError(
            f"Incorrect type [{type(path)}] for path: {path.uri()}")

    fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
    fs.mv_path_to_normalized_path(path,
                                  file_type=GcsfsDirectIngestFileType.RAW_DATA)

    return "", HTTPStatus.OK
Example No. 6
    def __init__(
        self,
        project_id: str,
        region: str,
        lower_bound_update_datetime: Optional[datetime.datetime],
        gcs_destination_path: Optional[GcsfsDirectoryPath] = None,
    ):
        self.project_id = project_id
        self.region = region.lower()

        self.auth = SftpAuth.for_region(region)
        self.delegate = SftpDownloadDelegateFactory.build(region_code=region)
        self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())

        self.unable_to_download_items: List[str] = []
        self.downloaded_items: List[Tuple[str, datetime.datetime]] = []
        self.skipped_files: List[str] = []

        self.lower_bound_update_datetime = lower_bound_update_datetime
        self.bucket = (
            gcsfs_sftp_download_bucket_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id
            )
            if gcs_destination_path is None
            else gcs_destination_path
        )
        self.download_dir = GcsfsDirectoryPath.from_dir_and_subdir(
            dir_path=self.bucket, subdir=RAW_INGEST_DIRECTORY
        )

        self.postgres_direct_ingest_file_metadata_manager = (
            PostgresDirectIngestRawFileMetadataManager(
                region,
                DirectIngestInstance.PRIMARY.database_version(
                    SystemLevel.STATE,
                    state_code=StateCode(self.region.upper())).name,
            ))
Example No. 7
    def __init__(self,
                 region_name: str,
                 system_level: SystemLevel,
                 ingest_directory_path: Optional[str] = None,
                 storage_directory_path: Optional[str] = None,
                 max_delay_sec_between_files: Optional[int] = None):
        super().__init__(region_name, system_level)
        self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
        self.max_delay_sec_between_files = max_delay_sec_between_files

        if not ingest_directory_path:
            ingest_directory_path = gcsfs_direct_ingest_directory_path_for_region(
                region_name, system_level
            )
        self.ingest_directory_path = GcsfsDirectoryPath.from_absolute_path(
            ingest_directory_path
        )

        if not storage_directory_path:
            storage_directory_path = (
                gcsfs_direct_ingest_storage_directory_path_for_region(
                    region_name, system_level
                )
            )

        self.storage_directory_path = GcsfsDirectoryPath.from_absolute_path(
            storage_directory_path
        )

        self.temp_output_directory_path = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_temporary_output_directory_path()
        )

        ingest_job_file_type_filter = (
            GcsfsDirectIngestFileType.INGEST_VIEW
            if self.region.is_raw_vs_ingest_file_name_detection_enabled()
            else None
        )
        self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_directory_path,
            self.get_file_tag_rank_list(),
            ingest_job_file_type_filter,
        )

        self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

        self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
            region_code=self.region.region_code)

        self.raw_file_import_manager = DirectIngestRawFileImportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_directory_path,
            big_query_client=BigQueryClientImpl())

        self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            file_metadata_manager=self.file_metadata_manager,
            big_query_client=BigQueryClientImpl(),
            view_collector=DirectIngestPreProcessedIngestViewCollector(
                self.region, self.get_file_tag_rank_list()))
Example No. 8
    def __init__(self, override_project_id: Optional[str] = None) -> None:
        self.gcs_fs = GcsfsFactory.build()
        self.override_project_id = override_project_id

        self.data_freshness_results: List[Dict[str, Union[str, bool]]] = []

        # This class takes heavy advantage of the fact that python dicts are thread-safe.
        self.store: InternalMetadataBackingStore = defaultdict(
            lambda: defaultdict(dict))
Example No. 9
def scrape_aggregate_reports():
    """Scrapes aggregate report PDFs for the requested state and uploads them to GCS."""

    # Please add new states in alphabetical order
    state_to_scraper = {
        "california": ca_aggregate_site_scraper.get_urls_to_download,
        "colorado": co_aggregate_site_scraper.get_urls_to_download,
        "florida": fl_aggregate_site_scraper.get_urls_to_download,
        "georgia": ga_aggregate_site_scraper.get_urls_to_download,
        "hawaii": hi_aggregate_site_scraper.get_urls_to_download,
        "kentucky": ky_aggregate_site_scraper.get_urls_to_download,
        "new_york": ny_aggregate_site_scraper.get_urls_to_download,
        "tennessee": tn_aggregate_site_scraper.get_urls_to_download,
        "texas": tx_aggregate_site_scraper.get_urls_to_download,
        "west_virginia": wv_aggregate_site_scraper.get_urls_to_download,
    }
    state = get_str_param_value("state", request.args)
    # We want to always download the pdf if it is NY because they always have
    # the same name.
    always_download = state == "new_york"
    is_ca = state == "california"
    is_co = state == "colorado"
    verify_ssl = state != "kentucky"
    urls = state_to_scraper[state]()
    fs = GcsfsFactory.build()
    logging.info("Scraping all pdfs for %s", state)

    for url in urls:
        post_data = None
        if isinstance(url, tuple):
            url, post_data = url
            # We need to append the year of the report to create uniqueness in
            # the name since california sends post requests with the same url.
            pdf_name = state
            if is_ca:
                pdf_name += str(post_data["year"])
        elif is_co:
            pdf_name = date.today().strftime("colorado-%m-%Y")
        else:
            pdf_name = urlparse(url).path.replace("/", "_").lower()
        historical_path = build_path(HISTORICAL_BUCKET, state, pdf_name)
        file_to_upload = _get_file_to_upload(historical_path, fs, url,
                                             pdf_name, always_download,
                                             post_data, verify_ssl)
        if file_to_upload:
            upload_path = build_path(UPLOAD_BUCKET, state, pdf_name)
            fs.upload_from_contents_handle_stream(
                path=upload_path,
                contents_handle=file_to_upload,
                content_type="application/pdf",
            )
            logging.info("Successfully downloaded %s", url)
        else:
            logging.info("Skipping %s because the file already exists", url)

    return "", HTTPStatus.OK
Example No. 10
    def __init__(self, ingest_bucket_path: GcsfsBucketPath) -> None:
        """Initialize the controller."""
        self.cloud_task_manager = DirectIngestCloudTaskManagerImpl()
        self.ingest_instance = DirectIngestInstance.for_ingest_bucket(
            ingest_bucket_path)
        self.region_lock_manager = DirectIngestRegionLockManager.for_direct_ingest(
            region_code=self.region.region_code,
            schema_type=self.system_level.schema_type(),
            ingest_instance=self.ingest_instance,
        )
        self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
        self.ingest_bucket_path = ingest_bucket_path
        self.storage_directory_path = (
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code=self.region_code(),
                system_level=self.system_level,
                ingest_instance=self.ingest_instance,
            ))

        self.temp_output_directory_path = (
            gcsfs_direct_ingest_temporary_output_directory_path())

        self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_bucket_path,
            self.get_file_tag_rank_list(),
        )

        self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

        self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
            region_code=self.region.region_code,
            ingest_database_name=self.ingest_database_key.db_name,
        )

        self.raw_file_import_manager = DirectIngestRawFileImportManager(
            region=self.region,
            fs=self.fs,
            ingest_bucket_path=self.ingest_bucket_path,
            temp_output_directory_path=self.temp_output_directory_path,
            big_query_client=BigQueryClientImpl(),
        )

        self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
            region=self.region,
            fs=self.fs,
            output_bucket_name=self.ingest_bucket_path.bucket_name,
            file_metadata_manager=self.file_metadata_manager,
            big_query_client=BigQueryClientImpl(),
            view_collector=DirectIngestPreProcessedIngestViewCollector(
                self.region, self.get_file_tag_rank_list()),
            launched_file_tags=self.get_file_tag_rank_list(),
        )

        self.ingest_instance_status_manager = DirectIngestInstanceStatusManager(
            self.region_code(), self.ingest_instance)
Example No. 11
    def for_schema_type(
        cls,
        schema_type: SchemaType,
        yaml_path: Optional[GcsfsFilePath] = None
    ) -> Optional["CloudSqlToBQConfig"]:
        """Logic for instantiating a config object for a schema type."""
        gcs_fs = GcsfsFactory.build()
        if not yaml_path:
            yaml_path = GcsfsFilePath.from_absolute_path(
                f"gs://{cls._get_project_id()}-configs/cloud_sql_to_bq_config.yaml"
            )
        yaml_string = gcs_fs.download_as_string(yaml_path)
        try:
            yaml_config = yaml.safe_load(yaml_string)
        except yaml.YAMLError as e:
            raise ValueError(
                f"Could not parse YAML in [{yaml_path.abs_path()}]") from e

        if schema_type == SchemaType.JAILS:
            return CloudSqlToBQConfig(
                metadata_base=base_schema.JailsBase,
                schema_type=SchemaType.JAILS,
                dataset_id=county_dataset_config.COUNTY_BASE_DATASET,
                columns_to_exclude=yaml_config.get("county_columns_to_exclude",
                                                   {}),
            )
        if schema_type == SchemaType.JUSTICE_COUNTS:
            # Justice Counts views currently rely on federated queries directly to Postgres instead of this refresh.
            # TODO(#5081): Re-enable this once arrays are removed from the Justice Counts schema.
            return None
        if schema_type == SchemaType.CASE_TRIAGE:
            # Case Triage does not need to be exported to BigQuery
            return None
        if schema_type == SchemaType.OPERATIONS:
            return CloudSqlToBQConfig(
                metadata_base=base_schema.OperationsBase,
                schema_type=SchemaType.OPERATIONS,
                region_codes_to_exclude=yaml_config.get(
                    "region_codes_to_exclude", []),
                dataset_id=operations_dataset_config.OPERATIONS_BASE_DATASET,
            )
        if schema_type == SchemaType.STATE:
            return CloudSqlToBQConfig(
                metadata_base=base_schema.StateBase,
                schema_type=SchemaType.STATE,
                dataset_id=state_dataset_config.STATE_BASE_DATASET,
                region_codes_to_exclude=yaml_config.get(
                    "region_codes_to_exclude", []),
                history_tables_to_include=yaml_config.get(
                    "state_history_tables_to_include", []),
            )

        raise ValueError(f"Unexpected schema type value [{schema_type}]")
Example No. 12
def ingest() -> Tuple[str, HTTPStatus]:
    manifest_path = get_str_param_value("manifest_path",
                                        request.args,
                                        preserve_case=True)

    if not manifest_path:
        raise exceptions.BadRequest("Parameter `manifest_path` is required.")

    manual_upload.ingest(GcsfsFactory.build(),
                         GcsfsFilePath.from_absolute_path(manifest_path))

    return "", HTTPStatus.OK
Example No. 13
    def for_schema_type(
        cls,
        schema_type: SchemaType,
        direct_ingest_instance: Optional[DirectIngestInstance] = None,
        yaml_path: Optional[GcsfsFilePath] = None,
    ) -> "CloudSqlToBQConfig":
        """Logic for instantiating a config object for a schema type."""
        if not cls.is_valid_schema_type(schema_type):
            raise ValueError(f"Unsupported schema_type: [{schema_type}]")

        if schema_type != SchemaType.STATE and direct_ingest_instance is not None:
            raise ValueError(
                "CloudSQLToBQConfig can only be initialized with DirectIngestInstance with STATE schema."
            )

        gcs_fs = GcsfsFactory.build()
        if not yaml_path:
            yaml_path = cls.default_config_path()
        yaml_string = gcs_fs.download_as_string(yaml_path)
        try:
            yaml_config = yaml.safe_load(yaml_string)
        except yaml.YAMLError as e:
            raise ValueError(
                f"Could not parse YAML in [{yaml_path.abs_path()}]") from e

        if schema_type == SchemaType.JAILS:
            return CloudSqlToBQConfig(
                schema_type=SchemaType.JAILS,
                columns_to_exclude=yaml_config.get("county_columns_to_exclude",
                                                   {}),
            )
        if schema_type == SchemaType.OPERATIONS:
            return CloudSqlToBQConfig(
                schema_type=SchemaType.OPERATIONS,
                region_codes_to_exclude=yaml_config.get(
                    "region_codes_to_exclude", []),
            )
        if schema_type == SchemaType.STATE:
            if direct_ingest_instance is None:
                direct_ingest_instance = DirectIngestInstance.PRIMARY
            return CloudSqlToBQConfig(
                schema_type=SchemaType.STATE,
                direct_ingest_instance=direct_ingest_instance,
                region_codes_to_exclude=yaml_config.get(
                    "region_codes_to_exclude", []),
                history_tables_to_include=yaml_config.get(
                    "state_history_tables_to_include", []),
            )
        if schema_type == SchemaType.CASE_TRIAGE:
            return CloudSqlToBQConfig(schema_type=SchemaType.CASE_TRIAGE)

        raise ValueError(f"Unexpected schema type value [{schema_type}]")
Example No. 14
def _write_batch_metadata(
    *,
    batch_id: str,
    state_code: StateCode,
    report_type: ReportType,
    **metadata_fields: str,
) -> None:
    gcsfs = GcsfsFactory.build()
    gcsfs.upload_from_string(
        path=_gcsfs_path_for_batch_metadata(batch_id, state_code),
        contents=json.dumps({**metadata_fields, "report_type": report_type.value}),
        content_type="text/json",
    )
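
Read together with Example No. 4, this supports a simple write-then-read round trip. A hedged sketch (the batch id, state code, and extra metadata field are placeholders):

_write_batch_metadata(
    batch_id="20230101120000",  # placeholder batch id
    state_code=StateCode("US_XX"),  # placeholder state code
    report_type=ReportType.POMonthlyReport,
    review_month="1",  # placeholder extra metadata field
)
stored = read_batch_metadata(batch_id="20230101120000", state_code=StateCode("US_XX"))
assert stored["report_type"] == ReportType.POMonthlyReport.value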
Example No. 15
def get_local_file(file_path: GcsfsFilePath) -> str:
    """
    Helper function for supporting local development flows.
    When in development environments, we fetch file contents from `recidiviz/case_triage/local/gcs`
    In Google Cloud environments, we delegate to Cloud Storage
    """

    if in_development():
        return Path(os.path.join(local_path, "gcs",
                                 file_path.abs_path())).read_text()

    gcs_fs = GcsfsFactory.build()
    return gcs_fs.download_as_string(file_path)
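
A short usage sketch (the GCS path is a placeholder):

# Placeholder bucket and object name.
contents = get_local_file(
    GcsfsFilePath.from_absolute_path("gs://example-project-configs/example.json")
)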
Example No. 16
def _retrieve_data_for_po_monthly_report(
    state_code: StateCode, batch_id: str
) -> List[Recipient]:
    """Retrieves the data if the report type is POMonthlyReport."""
    data_bucket = utils.get_data_storage_bucket_name()
    data_filename = ""
    gcs_file_system = GcsfsFactory.build()
    try:
        data_filename = utils.get_data_filename(state_code, ReportType.POMonthlyReport)
        path = GcsfsFilePath.from_absolute_path(f"gs://{data_bucket}/{data_filename}")
        file_contents = gcs_file_system.download_as_string(path)
    except BaseException:
        logging.info("Unable to load data file %s/%s", data_bucket, data_filename)
        raise

    archive_bucket = utils.get_data_archive_bucket_name()
    archive_filename = ""
    try:
        archive_filename = utils.get_data_archive_filename(batch_id, state_code)
        archive_path = GcsfsFilePath.from_absolute_path(
            f"gs://{archive_bucket}/{archive_filename}"
        )
        gcs_file_system.upload_from_string(
            path=archive_path, contents=file_contents, content_type="text/json"
        )
    except Exception:
        logging.error(
            "Unable to archive the data file to %s/%s", archive_bucket, archive_filename
        )
        raise

    json_list = file_contents.splitlines()

    recipient_data: List[dict] = []
    for json_str in json_list:
        try:
            item = json.loads(json_str)
        except Exception as err:
            logging.error(
                "Unable to parse JSON found in the file %s. Offending json string is: '%s'. <%s> %s",
                data_filename,
                json_str,
                type(err).__name__,
                err,
            )
        else:
            if email := item.get("email_address"):
                mismatches = _get_mismatch_data_for_officer(email)
                if mismatches is not None:
                    item["mismatches"] = mismatches
            recipient_data.append(item)
Example No. 17
    def __init__(
        self,
        dataset_nickname: str,
        metadata_file_prefix: str,
        override_project_id: Optional[str] = None,
    ) -> None:
        super().__init__(override_project_id)
        self.gcs_fs = GcsfsFactory.build()
        self.dataset_nickname = dataset_nickname
        self.metadata_file_prefix = metadata_file_prefix

        # This class takes heavy advantage of the fact that python dicts are thread-safe.
        self.store: InternalMetadataBackingStore = defaultdict(
            lambda: defaultdict(dict))
Example No. 18
def do_metric_export_for_configs(
    export_configs: Dict[str, Sequence[ExportBigQueryViewConfig]],
    state_code_filter: Optional[str],
    override_view_exporter: Optional[BigQueryViewExporter] = None,
) -> None:
    """Triggers the export given the export_configs."""

    gcsfs_client = GcsfsFactory.build()
    delegate_export_map = get_delegate_export_map(gcsfs_client,
                                                  override_view_exporter)

    for export_name, view_export_configs in export_configs.items():
        export_log_message = f"Starting [{export_name}] export"
        export_log_message += (f" for state_code [{state_code_filter}]."
                               if state_code_filter else ".")

        logging.info(export_log_message)

        # The export will error if the validations fail for the set of view_export_configs. We want to log this failure
        # as a warning, but not block on the rest of the exports.
        try:
            export_views_with_exporters(gcsfs_client, view_export_configs,
                                        delegate_export_map)
        except ViewExportValidationError as e:
            warning_message = f"Export validation failed for {export_name}"

            if state_code_filter:
                warning_message += f" for state: {state_code_filter}"

            logging.warning("%s\n%s", warning_message, str(e))
            with monitoring.measurements({
                    monitoring.TagKey.METRIC_VIEW_EXPORT_NAME:
                    export_name,
                    monitoring.TagKey.REGION:
                    state_code_filter,
            }) as measurements:
                measurements.measure_int_put(m_failed_metric_export_validation,
                                             1)

            # Do not treat validation failures as fatal errors
            continue
        except Exception as e:
            with monitoring.measurements({
                    monitoring.TagKey.METRIC_VIEW_EXPORT_NAME:
                    export_name,
                    monitoring.TagKey.REGION:
                    state_code_filter,
            }) as measurements:
                measurements.measure_int_put(m_failed_metric_export_job, 1)
            raise e
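
A hedged invocation sketch (the export name and state code are placeholders; the config sequence would normally be built elsewhere rather than inline):

example_configs: Dict[str, Sequence[ExportBigQueryViewConfig]] = {
    "EXAMPLE_EXPORT": [],  # normally a non-empty sequence of ExportBigQueryViewConfig
}
do_metric_export_for_configs(
    export_configs=example_configs,
    state_code_filter="US_XX",
)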
Example No. 19
def ingest() -> Tuple[str, HTTPStatus]:
    manifest_path = get_str_param_value("manifest_path",
                                        request.args,
                                        preserve_case=True)

    if not manifest_path:
        return "Parameter `manifest_path` is required.", HTTPStatus.BAD_REQUEST

    try:
        manual_upload.ingest(GcsfsFactory.build(),
                             GcsfsFilePath.from_absolute_path(manifest_path))
    except Exception as e:
        return f"Error ingesting data: '{e}'", HTTPStatus.INTERNAL_SERVER_ERROR

    return "", HTTPStatus.OK
Example No. 20
    def _cache_ingest_file_as_parquet_task() -> Tuple[str, int]:
        """Downloads a GCS file and stores it to our Redis cache in Parquet format

         Example:
             POST /admin/data_discovery/cache_ingest_file_as_parquet_task
         Request Body:
             gcs_file_uri: (string) The `gs://` URI of the file to cache
             file_encoding: (string) The encoding of said file
             file_separator: (string) The value delimiter of side file
             file_quoting: (int) A `csv.QUOTE_*` value for the parser i.e. 3 (csv.QUOTE_NONE)
        Args:
             N/A
         Returns:
             Cache hit/miss result
        """
        cache = get_data_discovery_cache()
        body = get_cloud_task_json_body()
        path = GcsfsFilePath.from_absolute_path(body["gcs_file_uri"])
        parquet_path = SingleIngestFileParquetCache.parquet_cache_key(path)

        if not cache.exists(parquet_path):
            fs = GcsfsFactory.build()
            parquet_cache = SingleIngestFileParquetCache(
                get_data_discovery_cache(), path, expiry=DataDiscoveryTTL.PARQUET_FILES
            )
            csv_reader = GcsfsCsvReader(fs)
            csv_reader.streaming_read(
                path,
                CacheIngestFileAsParquetDelegate(parquet_cache, path),
                encodings_to_try=list(
                    {
                        body["file_encoding"],
                        *COMMON_RAW_FILE_ENCODINGS,
                    }
                ),
                delimiter=body["file_separator"],
                quoting=body["file_quoting"],
                lineterminator=body.get("file_custom_line_terminator"),
                chunk_size=75000,
                index_col=False,
                keep_default_na=False,
            )

            return CACHE_MISS, HTTPStatus.CREATED

        return CACHE_HIT, HTTPStatus.OK
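
An illustrative task body matching the docstring above (all values are placeholders):

example_body = {
    "gcs_file_uri": "gs://example-project-ingest/raw_file.csv",  # placeholder URI
    "file_encoding": "utf-8",
    "file_separator": ",",
    "file_quoting": 3,  # csv.QUOTE_NONE
}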
Example No. 21
def load_files_from_storage(bucket_name: str,
                            batch_id_path: str) -> Dict[str, str]:
    """Loads the files for this batch and bucket name from Cloud Storage.

    This function is guaranteed to either return a dictionary with 1 or more results or throw an exception if there is
    a problem loading any of the files.

    Args:
        bucket_name: The bucket name to find the batch_id files
        batch_id_path: The path, containing the state_code, to the identifier for this batch

    Returns:
        A dict whose keys are the email addresses of recipients and values are strings of the email body or file
        attachment content to send.

    Raises:
        Passes through exceptions from Storage and raises its own if there are no results in this batch.
    """
    try:
        gcs_file_system = GcsfsFactory.build()
        paths = [
            path for path in gcs_file_system.ls_with_blob_prefix(
                bucket_name, blob_prefix=batch_id_path)
            if isinstance(path, GcsfsFilePath)
        ]
    except Exception:
        logging.error(
            "Unable to list files in folder. Bucket = %s, batch_id = %s",
            bucket_name,
            batch_id_path,
        )
        raise

    files = {}
    for path in paths:
        try:
            body = gcs_file_system.download_as_string(path)
        except Exception:
            logging.error("Unable to load file %s from bucket %s",
                          path.blob_name, bucket_name)
            raise
        else:
            email_address = email_from_file_name(path.file_name)
            files[email_address] = body

    return files
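
A minimal usage sketch (the bucket name and batch path are placeholders):

files = load_files_from_storage(
    bucket_name="example-project-report-html",  # placeholder bucket
    batch_id_path="US_XX/20230101120000",  # placeholder state_code/batch_id path
)
for email_address, body in files.items():
    logging.info("Loaded %d characters for %s", len(body), email_address)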
Example No. 22
def upload_file_contents_to_gcs(file_path: GcsfsFilePath, file_contents: str,
                                content_type: str) -> None:
    """Uploads a file's content to Cloud Storage.

    Args:
        file_path: The GCS path to write to
        file_contents: The content to upload to the Cloud Storage file path.
        content_type: The content type for the file that will be uploaded to Cloud Storage.
    """
    try:
        gcs_file_system = GcsfsFactory.build()
        gcs_file_system.upload_from_string(path=file_path,
                                           contents=file_contents,
                                           content_type=content_type)
    except Exception:
        logging.error("Error while attempting upload of %s", file_path)
        raise
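
A brief usage sketch (the path and contents are placeholders):

upload_file_contents_to_gcs(
    file_path=GcsfsFilePath.from_absolute_path("gs://example-bucket/report.html"),
    file_contents="<html></html>",
    content_type="text/html",
)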
Example No. 23
    def _initialize_stores(self) -> None:
        self.ingest_metadata_store = DatasetMetadataCountsStore(
            _INGEST_METADATA_NICKNAME,
            _INGEST_METADATA_PREFIX,
        )

        self.validation_metadata_store = DatasetMetadataCountsStore(
            _VALIDATION_METADATA_NICKNAME,
            _VALIDATION_METADATA_PREFIX,
        )

        self.ingest_data_freshness_store = IngestDataFreshnessStore()

        self.ingest_operations_store = IngestOperationsStore()

        self.validation_status_store = ValidationStatusStore()

        self.monthly_reports_gcsfs = GcsfsFactory.build()
Example No. 24
def state_aggregate() -> Tuple[str, HTTPStatus]:
    """Calls state aggregates"""
    bucket = get_str_param_value("bucket", request.args)
    state = get_str_param_value("state", request.args)
    filename = get_str_param_value("filename", request.args)
    project_id = metadata.project_id()
    logging.info("The project id is %s", project_id)
    if not bucket or not state or not filename:
        raise StateAggregateError("All of state, bucket, and filename must be provided")
    directory_path = GcsfsDirectoryPath(bucket, state)
    path = GcsfsFilePath.from_directory_and_file_name(directory_path, filename)
    parser = STATE_TO_PARSER[state]
    fs = GcsfsFactory.build()
    logging.info("The path to download from is %s", path)

    logging.info("The files in the directory are:")
    logging.info(
        fs.ls_with_blob_prefix(
            bucket_name=directory_path.bucket_name,
            blob_prefix=directory_path.relative_path,
        )
    )

    # Providing a stream buffer to the tabula reader does not work because it
    # tries to load the file into the local filesystem. Since App Engine is a
    # read-only filesystem (except for the tmpdir), we download the file into
    # the local tmpdir and pass that in.
    handle = fs.download_to_temp_file(path)
    if not handle:
        raise StateAggregateError(f"Unable to download file: {path}")
    logging.info("Successfully downloaded file from gcs: %s", handle.local_file_path)

    result = parser(handle.local_file_path)
    logging.info("Successfully parsed the report")
    for table, df in result.items():
        dao.write_df(table, df)

    # If we are successful, we want to move the file out of the cloud
    # function triggered directory, and into the historical path.
    historical_path = GcsfsFilePath.from_directory_and_file_name(
        GcsfsDirectoryPath(HISTORICAL_BUCKET.format(project_id), state), filename
    )
    fs.mv(path, historical_path)
    return "", HTTPStatus.OK
Example No. 25
def main(*, project_id: str, local_filepath: str, backfill: bool) -> None:
    """If a local filepath is provided, uploads the file at that path to Google Cloud
    Storage and adds any rows that do not already exist to the BigQuery table.
    If backfill is set, loads all rows from the CSVs in GCS into the BQ table.
    """

    fs = GcsfsFactory.build()
    bq_client = BigQueryClientImpl(project_id=project_id)

    # If backfill, clear out all data in BQ table and load in new rows
    if backfill and (input(
            "Are you sure? This action will delete all data from current raw data "
            "table and replace it with rows from CSVs currently in GCS. "
            "Enter 'backfill' if you are sure. \n") == "backfill"):
        # Clear out old rows from table
        bq_client.delete_from_table_async(
            dataset_id=DATASET_ID,
            table_id=FINAL_DESTINATION_TABLE,
            filter_clause="WHERE TRUE",
        )
        # For each file in table, load into BQ
        for blob in fs.ls_with_blob_prefix(f"{project_id}{BUCKET_SUFFIX}", ""):
            if isinstance(blob, GcsfsFilePath):
                logging.info(
                    "Back filling from blob [%s] in bucket [%s]",
                    blob.file_name,
                    f"{project_id}{BUCKET_SUFFIX}",
                )
                load_from_gcs_to_temp_table(bq_client, project_id,
                                            blob.file_name)
                load_from_temp_to_permanent_table(bq_client, project_id)

    # If local file path was provided, upload that file to GCS and load data into BQ
    if local_filepath:
        # If local file path provided, upload file at file path into GCS
        upload_raw_file_to_gcs(fs, local_filepath,
                               f"{project_id}{BUCKET_SUFFIX}")
        logging.info("Found local file path, uploading from [%s]",
                     local_filepath)

        # Load data to temporary table and then to permanent
        load_from_gcs_to_temp_table(bq_client, project_id,
                                    date.today().strftime(DATE_FORMAT))
        load_from_temp_to_permanent_table(bq_client, project_id)
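
A hedged invocation sketch for a backfill-only run (the project id is a placeholder; an empty local_filepath skips the upload branch):

main(project_id="example-project", local_filepath="", backfill=True)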
Example No. 26
    def __init__(
        self,
        paths_with_timestamps: List[Tuple[str, datetime.datetime]],
        project_id: str,
        region: str,
        gcs_destination_path: Optional[str] = None,
    ):
        self.paths_with_timestamps = paths_with_timestamps
        self.project_id = project_id
        self.region = region.lower()

        self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())
        self.gcs_destination_path = (
            GcsfsDirectoryPath.from_absolute_path(
                gcsfs_direct_ingest_directory_path_for_region(
                    region, SystemLevel.STATE, project_id=self.project_id
                )
            )
            if gcs_destination_path is None
            else GcsfsDirectoryPath.from_absolute_path(gcs_destination_path)
        )
        self.uploaded_files: List[str] = []
        self.unable_to_upload_files: List[str] = []
Example No. 27
    def __init__(
        self,
        *,
        state_code: StateCode,
        sandbox_dataset_prefix: str,
        test_ingest_bucket: GcsfsBucketPath,
    ):

        check_is_valid_sandbox_bucket(test_ingest_bucket)

        super().__init__(
            region=get_region(state_code.value.lower(), is_direct_ingest=True),
            fs=DirectIngestGCSFileSystem(GcsfsFactory.build()),
            ingest_bucket_path=test_ingest_bucket,
            temp_output_directory_path=GcsfsDirectoryPath.from_dir_and_subdir(
                test_ingest_bucket, "temp_raw_data"
            ),
            big_query_client=BigQueryClientImpl(),
        )
        self.sandbox_dataset = (
            f"{sandbox_dataset_prefix}_{super()._raw_tables_dataset()}"
        )
Example No. 28
def export_view_data_to_cloud_storage(
    export_job_filter: str,
    override_view_exporter: Optional[BigQueryViewExporter] = None,
) -> None:
    """Exports data in BigQuery metric views to cloud storage buckets.

    Optionally takes in a BigQueryViewExporter for performing the export operation. If none is provided, this defaults
    to using a CompositeBigQueryViewExporter with delegates of JsonLinesBigQueryViewExporter and
    OptimizedMetricBigQueryViewExporter.
    """
    export_configs_for_filter: List[ExportViewCollectionConfig] = []
    bq_view_namespaces_to_update: Set[BigQueryViewNamespace] = set()
    for dataset_export_config in export_config.VIEW_COLLECTION_EXPORT_CONFIGS:
        if not dataset_export_config.matches_filter(export_job_filter):
            logging.info(
                "Skipped metric export for config [%s] with filter [%s]",
                dataset_export_config,
                export_job_filter,
            )
            continue

        export_configs_for_filter.append(dataset_export_config)
        bq_view_namespaces_to_update.add(
            dataset_export_config.bq_view_namespace)

    if not export_configs_for_filter:
        raise ValueError("Export filter did not match any export configs: ",
                         export_job_filter)

    for bq_view_namespace_to_update in bq_view_namespaces_to_update:
        view_builders_for_views_to_update = view_update_manager.VIEW_BUILDERS_BY_NAMESPACE[
            bq_view_namespace_to_update
        ]

        # TODO(#5125): Once view update is consistently trivial, always update all views in namespace
        if (bq_view_namespace_to_update
                in export_config.NAMESPACES_REQUIRING_FULL_UPDATE):
            view_update_manager.create_dataset_and_deploy_views_for_view_builders(
                bq_view_namespace_to_update, view_builders_for_views_to_update)

        # The view deploy will only have rematerialized views that were updated since the last deploy;
        # this call ensures that all materialized tables get refreshed.
        view_update_manager.rematerialize_views_for_namespace(
            bq_view_namespace=bq_view_namespace_to_update,
            candidate_view_builders=view_builders_for_views_to_update,
        )

    gcsfs_client = GcsfsFactory.build()
    if override_view_exporter is None:
        bq_client = BigQueryClientImpl()

        # Some of our views intentionally export empty files (e.g. some of the
        # ingest_metadata views), so we just check for existence.
        csv_exporter = CSVBigQueryViewExporter(
            bq_client, ExistsBigQueryViewExportValidator(gcsfs_client))
        json_exporter = JsonLinesBigQueryViewExporter(
            bq_client, ExistsBigQueryViewExportValidator(gcsfs_client))
        metric_exporter = OptimizedMetricBigQueryViewExporter(
            bq_client,
            OptimizedMetricBigQueryViewExportValidator(gcsfs_client))

        delegate_export_map = {
            ExportOutputFormatType.CSV: csv_exporter,
            ExportOutputFormatType.HEADERLESS_CSV: csv_exporter,
            ExportOutputFormatType.JSON: json_exporter,
            ExportOutputFormatType.METRIC: metric_exporter,
        }
    else:
        delegate_export_map = {
            ExportOutputFormatType.CSV: override_view_exporter,
            ExportOutputFormatType.HEADERLESS_CSV: override_view_exporter,
            ExportOutputFormatType.JSON: override_view_exporter,
            ExportOutputFormatType.METRIC: override_view_exporter,
        }

    project_id = metadata.project_id()

    for dataset_export_config in export_configs_for_filter:
        logging.info(
            "Starting metric export for dataset_config [%s] with filter [%s]",
            dataset_export_config,
            export_job_filter,
        )

        view_export_configs = dataset_export_config.export_configs_for_views_to_export(
            project_id=project_id)

        # The export will error if the validations fail for the set of view_export_configs. We want to log this failure
        # as a warning, but not block on the rest of the exports.
        try:
            export_views_with_exporters(gcsfs_client, view_export_configs,
                                        delegate_export_map)
        except ViewExportValidationError:
            warning_message = (
                f"Export validation failed for {dataset_export_config.export_name}"
            )

            if dataset_export_config.state_code_filter is not None:
                warning_message += (
                    f" for state: {dataset_export_config.state_code_filter}")

            logging.warning(warning_message)
            with monitoring.measurements({
                    monitoring.TagKey.METRIC_VIEW_EXPORT_NAME:
                    dataset_export_config.export_name,
                    monitoring.TagKey.REGION:
                    dataset_export_config.state_code_filter,
            }) as measurements:
                measurements.measure_int_put(m_failed_metric_export_validation,
                                             1)

            # Do not treat validation failures as fatal errors
            continue
        except Exception as e:
            with monitoring.measurements({
                    monitoring.TagKey.METRIC_VIEW_EXPORT_NAME:
                    dataset_export_config.export_name,
                    monitoring.TagKey.REGION:
                    dataset_export_config.state_code_filter,
            }) as measurements:
                measurements.measure_int_put(m_failed_metric_export_job, 1)
            raise e
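
A minimal invocation sketch (the filter value is a placeholder; per the code above, a filter that matches no export configs raises a ValueError):

export_view_data_to_cloud_storage(export_job_filter="US_XX")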
Example No. 29
    def __init__(self, project_id: Optional[str] = None):
        if not project_id:
            project_id = metadata.project_id()
        self.fs = GcsfsFactory.build()
        self.bucket_name = f"{project_id}-gcslock"
Example No. 30
    def __init__(self, ingest_bucket_path: GcsfsBucketPath):
        super().__init__(ingest_bucket_path)
        self.csv_reader = GcsfsCsvReader(GcsfsFactory.build())