    def _fetch(
        self,
        connection: pysftp.Connection,
        file_path: str,
        file_timestamp: datetime.datetime,
    ) -> None:
        """Fetches data files from the SFTP, tracking which items downloaded and failed to download."""
        normalized_sftp_path = os.path.normpath(file_path)
        normalized_upload_path = GcsfsFilePath.from_directory_and_file_name(
            dir_path=self.download_dir,
            file_name=os.path.basename(
                to_normalized_unprocessed_file_path(
                    normalized_sftp_path,
                    file_type=GcsfsDirectIngestFileType.RAW_DATA,
                    dt=file_timestamp,
                )
            ),
        )
        if not self.postgres_direct_ingest_file_metadata_manager.has_raw_file_been_discovered(
            normalized_upload_path
        ) and not self.postgres_direct_ingest_file_metadata_manager.has_raw_file_been_processed(
            normalized_upload_path
        ):
            logging.info("Downloading %s into %s", normalized_sftp_path, self.download_dir)
            try:
                path = GcsfsFilePath.from_directory_and_file_name(
                    dir_path=self.download_dir, file_name=normalized_sftp_path
                )
                self.gcsfs.upload_from_contents_handle_stream(
                    path=path,
                    contents_handle=GcsfsSftpFileContentsHandle(
                        sftp_connection=connection, local_file_path=file_path
                    ),
                    content_type=BYTES_CONTENT_TYPE,
                )
                logging.info("Post processing %s", path.uri())
                self.downloaded_items.append(
                    (self.delegate.post_process_downloads(path, self.gcsfs), file_timestamp)
                )
            except IOError as e:
                logging.info(
                    "Could not download %s into %s: %s",
                    normalized_sftp_path,
                    self.download_dir,
                    e.args,
                )
                self.unable_to_download_items.append(file_path)
        else:
            logging.info(
                "Skipping downloading %s because it has already been previously downloaded for ingest.",
                normalized_sftp_path,
            )
            self.skipped_files.append(file_path)
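
The path handling above leans on the standard library; here is a stdlib-only illustration with a hypothetical SFTP path. In _fetch, the basename is then taken of the already-normalized, timestamped file name before the GCS upload path is built.

import os

# Hypothetical remote path, as an SFTP listing might return it.
sftp_path = "outgoing//2021-03-05/./elite_offenders.csv"

normalized = os.path.normpath(sftp_path)   # collapses "//" and "./" segments
print(normalized)                          # outgoing/2021-03-05/elite_offenders.csv (on POSIX)
print(os.path.basename(normalized))        # elite_offenders.csv
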
def build_path(bucket_template: str, state: str,
               pdf_name: str) -> GcsfsFilePath:
    return GcsfsFilePath.from_directory_and_file_name(
        GcsfsDirectoryPath(bucket_template.format(metadata.project_id()),
                           state),
        pdf_name,
    )
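
A hedged usage sketch of build_path; the bucket template, state, and file name below are hypothetical, and the resulting URI shape assumes metadata.project_id() resolves to a real project id.

# Hypothetical inputs -- the real template/state/pdf names come from the caller.
path = build_path("{}-state-aggregate-reports", "colorado", "2021-03-05_daily.pdf")

# If metadata.project_id() returned "my-project", the resulting path would be:
#   gs://my-project-state-aggregate-reports/colorado/2021-03-05_daily.pdf
print(path.uri())
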
    def _create_split_file_path(
        self,
        original_file_path: GcsfsFilePath,
        output_dir: GcsfsDirectoryPath,
        split_num: int,
    ) -> GcsfsFilePath:
        parts = filename_parts_from_path(original_file_path)

        rank_str = str(split_num + 1).zfill(5)
        updated_file_name = (
            f"{parts.stripped_file_name}_{rank_str}"
            f"_{SPLIT_FILE_SUFFIX}_size{self.ingest_file_split_line_limit}"
            f".{parts.extension}")

        file_type = (
            GcsfsDirectIngestFileType.INGEST_VIEW
            if self.region.is_raw_vs_ingest_file_name_detection_enabled() else
            GcsfsDirectIngestFileType.UNSPECIFIED)

        return GcsfsFilePath.from_directory_and_file_name(
            output_dir,
            to_normalized_unprocessed_file_path(updated_file_name,
                                                file_type=file_type,
                                                dt=parts.utc_upload_datetime),
        )
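
A stdlib-only sketch of the split-file naming scheme above; SPLIT_FILE_SUFFIX and the line limit are stand-in values here, not the library's actual constants.

split_file_suffix = "file_split"  # stand-in for SPLIT_FILE_SUFFIX
line_limit = 2500                 # stand-in for self.ingest_file_split_line_limit
stripped_file_name, extension = "myReport", "csv"

for split_num in range(2):
    rank_str = str(split_num + 1).zfill(5)  # "00001", "00002", ...
    print(f"{stripped_file_name}_{rank_str}_{split_file_suffix}_size{line_limit}.{extension}")
# myReport_00001_file_split_size2500.csv
# myReport_00002_file_split_size2500.csv
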
    def _move_files(self, from_uri: str):
        curr_gcsfs_file_path = GcsfsFilePath.from_absolute_path(from_uri)
        previous_date_format = filename_parts_from_path(
            curr_gcsfs_file_path).date_str
        new_date_format = date.fromisoformat(previous_date_format).strftime(
            "%Y/%m/%d/")

        path_with_new_file_name = GcsfsFilePath.from_absolute_path(
            to_normalized_unprocessed_file_path_from_normalized_path(
                from_uri, GcsfsDirectIngestFileType.RAW_DATA))

        if DirectIngestGCSFileSystem.is_processed_file(curr_gcsfs_file_path):
            path_with_new_file_name = GcsfsFilePath.from_absolute_path(
                to_normalized_processed_file_path_from_normalized_path(
                    from_uri, GcsfsDirectIngestFileType.RAW_DATA))

        raw_dir_with_date = GcsfsDirectoryPath.from_dir_and_subdir(
            self.region_storage_raw_dir_path, new_date_format)

        to_uri = GcsfsFilePath.from_directory_and_file_name(
            raw_dir_with_date, path_with_new_file_name.file_name).uri()

        if not self.dry_run:
            gsutil_mv(from_path=from_uri, to_path=to_uri)
        with self.mutex:
            self.move_list.append((from_uri, to_uri))
            if self.move_progress:
                self.move_progress.next()
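
The date-to-subdirectory conversion above is plain datetime formatting; a stdlib-only sketch with an example date:

from datetime import date

previous_date_format = "2021-03-05"  # a date_str parsed out of the file name
new_date_format = date.fromisoformat(previous_date_format).strftime("%Y/%m/%d/")
print(new_date_format)               # "2021/03/05/" -- the dated subdirectory
                                     # under the raw storage directory
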
def path_for_fixture_file_in_test_gcs_directory(
    *,
    bucket_path: GcsfsBucketPath,
    filename: str,
    should_normalize: bool,
    file_type: Optional[GcsfsDirectIngestFileType],
    dt: Optional[datetime.datetime] = None,
) -> GcsfsFilePath:
    file_path_str = filename

    if should_normalize:
        if not file_type:
            raise ValueError(
                "Expected file_type for path normalization but got None")
        file_path_str = to_normalized_unprocessed_file_path(
            original_file_path=file_path_str, file_type=file_type, dt=dt)

    file_path = GcsfsFilePath.from_directory_and_file_name(
        dir_path=bucket_path,
        file_name=file_path_str,
    )
    if not isinstance(file_path, GcsfsFilePath):
        raise ValueError(
            f"Expected type GcsfsFilePath, found {type(file_path)} for path: {file_path.abs_path()}"
        )
    return file_path
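
A hedged usage sketch; the bucket object and file name are hypothetical, and only the behavior visible in the function above is assumed.

import datetime

# "ingest_bucket" stands in for a GcsfsBucketPath built elsewhere in a test.
fixture_path = path_for_fixture_file_in_test_gcs_directory(
    bucket_path=ingest_bucket,
    filename="tagA.csv",
    should_normalize=True,
    file_type=GcsfsDirectIngestFileType.RAW_DATA,
    dt=datetime.datetime(2021, 3, 5, 12, 0, 0),
)
# With should_normalize=False the raw filename is used as-is; with
# should_normalize=True and no file_type, a ValueError is raised.
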
Example #6
    def _fetch(
        self,
        connection: pysftp.Connection,
        file_path: str,
        file_timestamp: datetime.datetime,
    ) -> None:
        """Fetches data files from the SFTP, tracking which items downloaded and failed to download."""
        normalized_sftp_path = os.path.normpath(file_path)
        logging.info("Downloading %s into %s", normalized_sftp_path, self.download_dir)
        try:
            path = GcsfsFilePath.from_directory_and_file_name(
                dir_path=self.download_dir, file_name=normalized_sftp_path
            )
            self.gcsfs.upload_from_contents_handle_stream(
                path=path,
                contents_handle=GcsfsSftpFileContentsHandle(
                    sftp_connection=connection, local_file_path=file_path
                ),
                content_type=BYTES_CONTENT_TYPE,
            )
            logging.info("Post processing %s", path.uri())
            self.downloaded_items.append(
                (self.delegate.post_process_downloads(path, self.gcsfs), file_timestamp)
            )
        except IOError as e:
            logging.info(
                "Could not download %s into %s: %s",
                normalized_sftp_path,
                self.download_dir,
                e.args,
            )
            self.unable_to_download_items.append(file_path)
Example #7
def state_aggregate() -> Tuple[str, HTTPStatus]:
    """Calls state aggregates"""
    bucket = get_str_param_value("bucket", request.args)
    state = get_str_param_value("state", request.args)
    filename = get_str_param_value("filename", request.args)
    project_id = metadata.project_id()
    logging.info("The project id is %s", project_id)
    if not bucket or not state or not filename:
        raise StateAggregateError("All of state, bucket, and filename must be provided")
    directory_path = GcsfsDirectoryPath(bucket, state)
    path = GcsfsFilePath.from_directory_and_file_name(directory_path, filename)
    parser = STATE_TO_PARSER[state]
    fs = GcsfsFactory.build()
    logging.info("The path to download from is %s", path)

    logging.info("The files in the directory are:")
    logging.info(
        fs.ls_with_blob_prefix(
            bucket_name=directory_path.bucket_name,
            blob_prefix=directory_path.relative_path,
        )
    )

    # Providing a stream buffer to the tabula reader does not work because it
    # tries to load the file into the local filesystem. Since App Engine's
    # filesystem is read-only (except for the tmpdir), we download the file
    # into the local tmpdir and pass that in.
    handle = fs.download_to_temp_file(path)
    if not handle:
        raise StateAggregateError(f"Unable to download file: {path}")
    logging.info("Successfully downloaded file from gcs: %s", handle.local_file_path)

    result = parser(handle.local_file_path)
    logging.info("Successfully parsed the report")
    for table, df in result.items():
        dao.write_df(table, df)

    # If we are successful, we want to move the file out of the cloud
    # function triggered directory, and into the historical path.
    historical_path = GcsfsFilePath.from_directory_and_file_name(
        GcsfsDirectoryPath(HISTORICAL_BUCKET.format(project_id), state), filename
    )
    fs.mv(path, historical_path)
    return "", HTTPStatus.OK
Example #8
def run_export(dry_run: bool, state_code: str, target_bucket: str) -> None:
    """Performs the export operation, exporting rows for the given state codes from the tables from the state dataset
    in the given project to CSV files with the same names as the tables to the given GCS bucket."""
    today = datetime.date.today()

    big_query_client = BigQueryClientImpl()
    dataset_ref = big_query_client.dataset_ref_for_id(STATE_BASE_DATASET)
    if not big_query_client.dataset_exists(dataset_ref):
        raise ValueError(f"Dataset {dataset_ref.dataset_id} does not exist")

    tables = big_query_client.list_tables(dataset_ref.dataset_id)

    export_configs = []
    for table in tables:
        logging.info("******************************")
        export_query = state_table_export_query_str(table,
                                                    [state_code.upper()])
        logging.info(export_query)

        if not export_query:
            continue

        export_dir = gcs_export_directory(target_bucket, today,
                                          state_code.lower())
        export_file_name = f"{table.table_id}_{today.isoformat()}_export.csv"
        file = GcsfsFilePath.from_directory_and_file_name(
            export_dir, export_file_name)
        output_uri = file.uri()

        export_config = ExportQueryConfig(
            query=export_query,
            query_parameters=[],
            intermediate_dataset_id="export_temporary_tables",
            intermediate_table_name=f"{dataset_ref.dataset_id}_{table.table_id}_{state_code.lower()}",
            output_uri=output_uri,
            output_format=bigquery.DestinationFormat.CSV,
        )
        export_configs.append(export_config)
        if dry_run:
            logging.info(
                "[DRY RUN] Created export configuration to export table to GCS: %s",
                export_config,
            )
        else:
            logging.info(
                "Created export configuration to export table to GCS: %s",
                export_config)

    if dry_run:
        logging.info("[DRY RUN] Exporting [%d] tables to GCS",
                     len(export_configs))
    else:
        logging.info("Exporting [%d] tables to GCS", len(export_configs))
        big_query_client.export_query_results_to_cloud_storage(
            export_configs, print_header=True)
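
A stdlib-only sketch of the per-table export file name built above; the table name and date are examples, and the dated directory itself comes from gcs_export_directory, which is not shown here.

import datetime

table_id = "state_person"          # example table name
today = datetime.date(2021, 3, 5)  # example export date
export_file_name = f"{table_id}_{today.isoformat()}_export.csv"
print(export_file_name)            # state_person_2021-03-05_export.csv
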
    def test_noop_without_staging(self) -> None:
        not_pointed_at_staging_file = GcsfsFilePath.from_directory_and_file_name(
            self.config_with_path("gnarly").output_directory,
            "staging_results.txt")
        self.assertEqual(
            ExportBigQueryViewConfig.revert_staging_path_to_original(
                not_pointed_at_staging_file),
            GcsfsFilePath.from_absolute_path(
                "gs://gnarly/staging_results.txt"),
        )
Example #10
    def _upload_file(
            self, path_with_timestamp: Tuple[str, datetime.datetime]) -> None:
        path, timestamp = path_with_timestamp
        normalized_file_name = os.path.basename(
            to_normalized_unprocessed_file_path(
                path,
                file_type=GcsfsDirectIngestFileType.RAW_DATA,
                dt=timestamp))
        full_file_upload_path = GcsfsFilePath.from_directory_and_file_name(
            self.gcs_destination_path, normalized_file_name)
        self._copy_to_ingest_bucket(path, full_file_upload_path)
    def setUp(self) -> None:
        bucket = gcsfs_direct_ingest_bucket_for_region(
            project_id="recidiviz-456",
            region_code=_REGION.region_code,
            system_level=SystemLevel.STATE,
            ingest_instance=DirectIngestInstance.PRIMARY,
        )
        self.ingest_view_file_path = GcsfsFilePath.from_directory_and_file_name(
            bucket,
            to_normalized_processed_file_name(
                "file_path.csv", GcsfsDirectIngestFileType.INGEST_VIEW),
        )
Example #12
    def _copy_to_ingest_bucket(self, path: str,
                               normalized_file_name: str) -> None:
        full_file_upload_path_uri = GcsfsFilePath.from_directory_and_file_name(
            self.ingest_bucket, normalized_file_name).uri()

        if not self.dry_run:
            gsutil_cp(path, full_file_upload_path_uri)

        with self.mutex:
            self.copies_list.append((path, full_file_upload_path_uri))
            if self.move_progress:
                self.move_progress.next()
    def _generate_output_path(self,
                              ingest_view_export_args: GcsfsIngestViewExportArgs,
                              metadata: DirectIngestIngestFileMetadata) -> GcsfsFilePath:
        ingest_view = self.ingest_views_by_tag[ingest_view_export_args.ingest_view_name]
        if not metadata.normalized_file_name:
            output_file_name = to_normalized_unprocessed_file_name(
                f'{ingest_view.file_tag}.csv',
                GcsfsDirectIngestFileType.INGEST_VIEW,
                dt=ingest_view_export_args.upper_bound_datetime_to_export
            )
        else:
            output_file_name = metadata.normalized_file_name

        return GcsfsFilePath.from_directory_and_file_name(self.ingest_directory_path, output_file_name)
    def test_happy_path(self) -> None:
        pointed_at_staging_file = GcsfsFilePath.from_directory_and_file_name(
            self.config_with_path(
                "gnarly").pointed_to_staging_subdirectory().output_directory,
            "foo.txt",
        )
        self.assertEqual(pointed_at_staging_file.abs_path(),
                         "gnarly/staging/foo.txt")

        self.assertEqual(
            ExportBigQueryViewConfig.revert_staging_path_to_original(
                pointed_at_staging_file),
            GcsfsFilePath.from_absolute_path("gs://gnarly/foo.txt"),
        )
    def copy(self, src_path: GcsfsFilePath, dst_path: GcsfsPath) -> None:
        src_bucket = self.storage_client.bucket(src_path.bucket_name)
        src_blob = self._get_blob(src_path)

        dst_bucket = self.storage_client.bucket(dst_path.bucket_name)

        if isinstance(dst_path, GcsfsFilePath):
            dst_blob_name = dst_path.blob_name
        elif isinstance(dst_path, GcsfsDirectoryPath):
            dst_blob_name = GcsfsFilePath.from_directory_and_file_name(
                dst_path, src_path.file_name).blob_name
        else:
            raise ValueError(f"Unexpected path type [{type(dst_path)}]")

        src_bucket.copy_blob(src_blob, dst_bucket, dst_blob_name)
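
A hedged usage sketch of the two destination types this copy() accepts; the bucket and object names are hypothetical, and "fs" stands in for an instance of the class the method above belongs to.

src = GcsfsFilePath.from_absolute_path("gs://my-bucket/reports/report.csv")

# 1) Explicit destination file path: the destination blob name is taken from dst_path.
fs.copy(src, GcsfsFilePath.from_absolute_path("gs://other-bucket/archive/report_v2.csv"))

# 2) Destination directory: the source file name ("report.csv") is reused.
fs.copy(src, GcsfsDirectoryPath("other-bucket", "archive"))
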
    def copy(self, src_path: GcsfsFilePath, dst_path: GcsfsPath) -> None:
        if isinstance(dst_path, GcsfsFilePath):
            path = dst_path
        elif isinstance(dst_path, GcsfsDirectoryPath):
            path = GcsfsFilePath.from_directory_and_file_name(
                dst_path, src_path.file_name)
        else:
            raise ValueError(f"Unexpected path type [{type(dst_path)}]")

        with self.mutex:
            entry = self.files[src_path.abs_path()]
            self.files[path.abs_path()] = FakeGCSFileSystemEntry(
                path, entry.local_path, "application/octet-stream")

        if self.delegate:
            self.delegate.on_file_added(path)
Example #17
    def copy(self,
             src_path: GcsfsFilePath,
             dst_path: GcsfsPath) -> None:
        src_bucket = self.storage_client.get_bucket(src_path.bucket_name)
        src_blob = src_bucket.get_blob(src_path.blob_name)
        if not src_blob:
            raise ValueError(
                f'Blob at path [{src_path.abs_path()}] does not exist')
        dst_bucket = self.storage_client.get_bucket(dst_path.bucket_name)

        if isinstance(dst_path, GcsfsFilePath):
            dst_blob_name = dst_path.blob_name
        elif isinstance(dst_path, GcsfsDirectoryPath):
            dst_blob_name = GcsfsFilePath.from_directory_and_file_name(
                dst_path, src_path.file_name).blob_name
        else:
            raise ValueError(f'Unexpected path type [{type(dst_path)}]')

        src_bucket.copy_blob(src_blob, dst_bucket, dst_blob_name)
Example #18
    def _create_split_file_path(
        self,
        original_file_path: GcsfsFilePath,
        output_dir: GcsfsDirectoryPath,
        split_num: int,
    ) -> GcsfsFilePath:
        parts = filename_parts_from_path(original_file_path)

        rank_str = str(split_num + 1).zfill(5)
        updated_file_name = (
            f"{parts.stripped_file_name}_{rank_str}"
            f"_{SPLIT_FILE_SUFFIX}_size{self.ingest_file_split_line_limit}"
            f".{parts.extension}")

        return GcsfsFilePath.from_directory_and_file_name(
            output_dir,
            to_normalized_unprocessed_file_path(
                updated_file_name,
                file_type=parts.file_type,
                dt=parts.utc_upload_datetime,
            ),
        )
Example #19
    def get_output_path(self, chunk_num: int) -> GcsfsFilePath:
        name, _extension = os.path.splitext(self.path.file_name)

        return GcsfsFilePath.from_directory_and_file_name(
            self.temp_output_directory_path, f"temp_{name}_{chunk_num}.csv")
Example #20
    def output_path(self, extension: str) -> GcsfsFilePath:
        file_name = f'{self.view.view_id}.{extension}'
        return GcsfsFilePath.from_directory_and_file_name(self.output_directory, file_name)
    def get_output_path(self, chunk_num: int):
        name, _extension = os.path.splitext(self.path.file_name)

        return GcsfsFilePath.from_directory_and_file_name(
            self.output_directory_path,
            f'temp_direct_ingest_{name}_{chunk_num}.csv')