    def _fetch(
        self,
        connection: pysftp.Connection,
        file_path: str,
        file_timestamp: datetime.datetime,
    ) -> None:
        """Fetches data files from the SFTP, tracking which items downloaded and failed to download."""
        normalized_sftp_path = os.path.normpath(file_path)
        normalized_upload_path = GcsfsFilePath.from_directory_and_file_name(
            dir_path=self.download_dir,
            file_name=os.path.basename(
                to_normalized_unprocessed_file_path(
                    normalized_sftp_path,
                    file_type=GcsfsDirectIngestFileType.RAW_DATA,
                    dt=file_timestamp,
                )
            ),
        )
        if not self.postgres_direct_ingest_file_metadata_manager.has_raw_file_been_discovered(
            normalized_upload_path
        ) and not self.postgres_direct_ingest_file_metadata_manager.has_raw_file_been_processed(
            normalized_upload_path
        ):
            logging.info("Downloading %s into %s", normalized_sftp_path, self.download_dir)
            try:
                path = GcsfsFilePath.from_directory_and_file_name(
                    dir_path=self.download_dir, file_name=normalized_sftp_path
                )
                self.gcsfs.upload_from_contents_handle_stream(
                    path=path,
                    contents_handle=GcsfsSftpFileContentsHandle(
                        sftp_connection=connection, local_file_path=file_path
                    ),
                    content_type=BYTES_CONTENT_TYPE,
                )
                logging.info("Post processing %s", path.uri())
                self.downloaded_items.append(
                    (self.delegate.post_process_downloads(path, self.gcsfs), file_timestamp)
                )
            except IOError as e:
                logging.info(
                    "Could not download %s into %s: %s",
                    normalized_sftp_path,
                    self.download_dir,
                    e.args,
                )
                self.unable_to_download_items.append(file_path)
        else:
            logging.info(
                "Skipping downloading %s because it has already been previously downloaded for ingest.",
                normalized_sftp_path,
            )
            self.skipped_files.append(file_path)
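
The path handling above leans on the standard library; here is a stdlib-only illustration with a hypothetical SFTP path. In _fetch, the basename is then taken of the already-normalized, timestamped file name before the GCS upload path is built.

import os

# Hypothetical remote path, as an SFTP listing might return it.
sftp_path = "outgoing//2021-03-05/./elite_offenders.csv"

normalized = os.path.normpath(sftp_path)   # collapses "//" and "./" segments
print(normalized)                          # outgoing/2021-03-05/elite_offenders.csv (on POSIX)
print(os.path.basename(normalized))        # elite_offenders.csv
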
def build_path(bucket_template: str, state: str,
               pdf_name: str) -> GcsfsFilePath:
    return GcsfsFilePath.from_directory_and_file_name(
        GcsfsDirectoryPath(bucket_template.format(metadata.project_id()),
                           state),
        pdf_name,
    )
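
A hedged usage sketch of build_path; the bucket template, state, and file name below are hypothetical, and the resulting URI shape assumes metadata.project_id() resolves to a real project id.

# Hypothetical inputs -- the real template/state/pdf names come from the caller.
path = build_path("{}-state-aggregate-reports", "colorado", "2021-03-05_daily.pdf")

# If metadata.project_id() returned "my-project", the resulting path would be:
#   gs://my-project-state-aggregate-reports/colorado/2021-03-05_daily.pdf
print(path.uri())
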
    def _create_split_file_path(
        self,
        original_file_path: GcsfsFilePath,
        output_dir: GcsfsDirectoryPath,
        split_num: int,
    ) -> GcsfsFilePath:
        parts = filename_parts_from_path(original_file_path)

        rank_str = str(split_num + 1).zfill(5)
        updated_file_name = (
            f"{parts.stripped_file_name}_{rank_str}"
            f"_{SPLIT_FILE_SUFFIX}_size{self.ingest_file_split_line_limit}"
            f".{parts.extension}")

        file_type = (
            GcsfsDirectIngestFileType.INGEST_VIEW
            if self.region.is_raw_vs_ingest_file_name_detection_enabled() else
            GcsfsDirectIngestFileType.UNSPECIFIED)

        return GcsfsFilePath.from_directory_and_file_name(
            output_dir,
            to_normalized_unprocessed_file_path(updated_file_name,
                                                file_type=file_type,
                                                dt=parts.utc_upload_datetime),
        )
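
A stdlib-only sketch of the split-file naming scheme above; SPLIT_FILE_SUFFIX and the line limit are stand-in values here, not the library's actual constants.

split_file_suffix = "file_split"  # stand-in for SPLIT_FILE_SUFFIX
line_limit = 2500                 # stand-in for self.ingest_file_split_line_limit
stripped_file_name, extension = "myReport", "csv"

for split_num in range(2):
    rank_str = str(split_num + 1).zfill(5)  # "00001", "00002", ...
    print(f"{stripped_file_name}_{rank_str}_{split_file_suffix}_size{line_limit}.{extension}")
# myReport_00001_file_split_size2500.csv
# myReport_00002_file_split_size2500.csv
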
    def _move_files(self, from_uri: str):
        curr_gcsfs_file_path = GcsfsFilePath.from_absolute_path(from_uri)
        previous_date_format = filename_parts_from_path(
            curr_gcsfs_file_path).date_str
        new_date_format = date.fromisoformat(previous_date_format).strftime(
            "%Y/%m/%d/")

        path_with_new_file_name = GcsfsFilePath.from_absolute_path(
            to_normalized_unprocessed_file_path_from_normalized_path(
                from_uri, GcsfsDirectIngestFileType.RAW_DATA))

        if DirectIngestGCSFileSystem.is_processed_file(curr_gcsfs_file_path):
            path_with_new_file_name = GcsfsFilePath.from_absolute_path(
                to_normalized_processed_file_path_from_normalized_path(
                    from_uri, GcsfsDirectIngestFileType.RAW_DATA))

        raw_dir_with_date = GcsfsDirectoryPath.from_dir_and_subdir(
            self.region_storage_raw_dir_path, new_date_format)

        to_uri = GcsfsFilePath.from_directory_and_file_name(
            raw_dir_with_date, path_with_new_file_name.file_name).uri()

        if not self.dry_run:
            gsutil_mv(from_path=from_uri, to_path=to_uri)
        with self.mutex:
            self.move_list.append((from_uri, to_uri))
            if self.move_progress:
                self.move_progress.next()
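
The date-to-subdirectory conversion above is plain datetime formatting; a stdlib-only sketch with an example date:

from datetime import date

previous_date_format = "2021-03-05"  # a date_str parsed out of the file name
new_date_format = date.fromisoformat(previous_date_format).strftime("%Y/%m/%d/")
print(new_date_format)               # "2021/03/05/" -- the dated subdirectory
                                     # under the raw storage directory
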
def path_for_fixture_file_in_test_gcs_directory(
    *,
    bucket_path: GcsfsBucketPath,
    filename: str,
    should_normalize: bool,
    file_type: Optional[GcsfsDirectIngestFileType],
    dt: Optional[datetime.datetime] = None,
) -> GcsfsFilePath:
    file_path_str = filename

    if should_normalize:
        if not file_type:
            raise ValueError(
                "Expected file_type for path normalization but got None")
        file_path_str = to_normalized_unprocessed_file_path(
            original_file_path=file_path_str, file_type=file_type, dt=dt)

    file_path = GcsfsFilePath.from_directory_and_file_name(
        dir_path=bucket_path,
        file_name=file_path_str,
    )
    if not isinstance(file_path, GcsfsFilePath):
        raise ValueError(
            f"Expected type GcsfsFilePath, found {type(file_path)} for path: {file_path.abs_path()}"
        )
    return file_path
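
A hedged usage sketch; the bucket object and file name are hypothetical, and only the behavior visible in the function above is assumed.

import datetime

# "ingest_bucket" stands in for a GcsfsBucketPath built elsewhere in a test.
fixture_path = path_for_fixture_file_in_test_gcs_directory(
    bucket_path=ingest_bucket,
    filename="tagA.csv",
    should_normalize=True,
    file_type=GcsfsDirectIngestFileType.RAW_DATA,
    dt=datetime.datetime(2021, 3, 5, 12, 0, 0),
)
# With should_normalize=False the raw filename is used as-is; with
# should_normalize=True and no file_type, a ValueError is raised.
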
Example #6
    def _fetch(
        self,
        connection: pysftp.Connection,
        file_path: str,
        file_timestamp: datetime.datetime,
    ) -> None:
        """Fetches data files from the SFTP, tracking which items downloaded and failed to download."""
        normalized_sftp_path = os.path.normpath(file_path)
        logging.info("Downloading %s into %s", normalized_sftp_path, self.download_dir)
        try:
            path = GcsfsFilePath.from_directory_and_file_name(
                dir_path=self.download_dir, file_name=normalized_sftp_path
            )
            self.gcsfs.upload_from_contents_handle_stream(
                path=path,
                contents_handle=GcsfsSftpFileContentsHandle(
                    sftp_connection=connection, local_file_path=file_path
                ),
                content_type=BYTES_CONTENT_TYPE,
            )
            logging.info("Post processing %s", path.uri())
            self.downloaded_items.append(
                (self.delegate.post_process_downloads(path, self.gcsfs), file_timestamp)
            )
        except IOError as e:
            logging.info(
                "Could not download %s into %s: %s",
                normalized_sftp_path,
                self.download_dir,
                e.args,
            )
            self.unable_to_download_items.append(file_path)
Example #7
def state_aggregate() -> Tuple[str, HTTPStatus]:
    """Calls state aggregates"""
    bucket = get_str_param_value("bucket", request.args)
    state = get_str_param_value("state", request.args)
    filename = get_str_param_value("filename", request.args)
    project_id = metadata.project_id()
    logging.info("The project id is %s", project_id)
    if not bucket or not state or not filename:
        raise StateAggregateError("All of state, bucket, and filename must be provided")
    directory_path = GcsfsDirectoryPath(bucket, state)
    path = GcsfsFilePath.from_directory_and_file_name(directory_path, filename)
    parser = STATE_TO_PARSER[state]
    fs = GcsfsFactory.build()
    logging.info("The path to download from is %s", path)

    logging.info("The files in the directory are:")
    logging.info(
        fs.ls_with_blob_prefix(
            bucket_name=directory_path.bucket_name,
            blob_prefix=directory_path.relative_path,
        )
    )

    # Providing a stream buffer to the tabula reader does not work because it
    # tries to load the file into the local filesystem. Since App Engine's
    # filesystem is read-only (except for the tmpdir), we download the file
    # into the local tmpdir and pass that in.
    handle = fs.download_to_temp_file(path)
    if not handle:
        raise StateAggregateError(f"Unable to download file: {path}")
    logging.info("Successfully downloaded file from gcs: %s", handle.local_file_path)

    result = parser(handle.local_file_path)
    logging.info("Successfully parsed the report")
    for table, df in result.items():
        dao.write_df(table, df)

    # If we are successful, we want to move the file out of the cloud
    # function triggered directory, and into the historical path.
    historical_path = GcsfsFilePath.from_directory_and_file_name(
        GcsfsDirectoryPath(HISTORICAL_BUCKET.format(project_id), state), filename
    )
    fs.mv(path, historical_path)
    return "", HTTPStatus.OK
Example #8
def run_export(dry_run: bool, state_code: str, target_bucket: str) -> None:
    """Performs the export operation, exporting rows for the given state codes from the tables from the state dataset
    in the given project to CSV files with the same names as the tables to the given GCS bucket."""
    today = datetime.date.today()

    big_query_client = BigQueryClientImpl()
    dataset_ref = big_query_client.dataset_ref_for_id(STATE_BASE_DATASET)
    if not big_query_client.dataset_exists(dataset_ref):
        raise ValueError(f"Dataset {dataset_ref.dataset_id} does not exist")

    tables = big_query_client.list_tables(dataset_ref.dataset_id)

    export_configs = []
    for table in tables:
        logging.info("******************************")
        export_query = state_table_export_query_str(table,
                                                    [state_code.upper()])
        logging.info(export_query)

        if not export_query:
            continue

        export_dir = gcs_export_directory(target_bucket, today,
                                          state_code.lower())
        export_file_name = f"{table.table_id}_{today.isoformat()}_export.csv"
        file = GcsfsFilePath.from_directory_and_file_name(
            export_dir, export_file_name)
        output_uri = file.uri()

        export_config = ExportQueryConfig(
            query=export_query,
            query_parameters=[],
            intermediate_dataset_id="export_temporary_tables",
            intermediate_table_name=f"{dataset_ref.dataset_id}_{table.table_id}_{state_code.lower()}",
            output_uri=output_uri,
            output_format=bigquery.DestinationFormat.CSV,
        )
        export_configs.append(export_config)
        if dry_run:
            logging.info(
                "[DRY RUN] Created export configuration to export table to GCS: %s",
                export_config,
            )
        else:
            logging.info(
                "Created export configuration to export table to GCS: %s",
                export_config)

    if dry_run:
        logging.info("[DRY RUN] Exporting [%d] tables to GCS",
                     len(export_configs))
    else:
        logging.info("Exporting [%d] tables to GCS", len(export_configs))
        big_query_client.export_query_results_to_cloud_storage(
            export_configs, print_header=True)
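
A stdlib-only sketch of the per-table export file name built above; the table name and date are examples, and the dated directory itself comes from gcs_export_directory, which is not shown here.

import datetime

table_id = "state_person"          # example table name
today = datetime.date(2021, 3, 5)  # example export date
export_file_name = f"{table_id}_{today.isoformat()}_export.csv"
print(export_file_name)            # state_person_2021-03-05_export.csv
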
    def test_noop_without_staging(self) -> None:
        not_pointed_at_staging_file = GcsfsFilePath.from_directory_and_file_name(
            self.config_with_path("gnarly").output_directory,
            "staging_results.txt")
        self.assertEqual(
            ExportBigQueryViewConfig.revert_staging_path_to_original(
                not_pointed_at_staging_file),
            GcsfsFilePath.from_absolute_path(
                "gs://gnarly/staging_results.txt"),
        )
Example #10
    def _upload_file(
            self, path_with_timestamp: Tuple[str, datetime.datetime]) -> None:
        path, timestamp = path_with_timestamp
        normalized_file_name = os.path.basename(
            to_normalized_unprocessed_file_path(
                path,
                file_type=GcsfsDirectIngestFileType.RAW_DATA,
                dt=timestamp))
        full_file_upload_path = GcsfsFilePath.from_directory_and_file_name(
            self.gcs_destination_path, normalized_file_name)
        self._copy_to_ingest_bucket(path, full_file_upload_path)
    def setUp(self) -> None:
        bucket = gcsfs_direct_ingest_bucket_for_region(
            project_id="recidiviz-456",
            region_code=_REGION.region_code,
            system_level=SystemLevel.STATE,
            ingest_instance=DirectIngestInstance.PRIMARY,
        )
        self.ingest_view_file_path = GcsfsFilePath.from_directory_and_file_name(
            bucket,
            to_normalized_processed_file_name(
                "file_path.csv", GcsfsDirectIngestFileType.INGEST_VIEW),
        )
Example #12
    def _copy_to_ingest_bucket(self, path: str,
                               normalized_file_name: str) -> None:
        full_file_upload_path_uri = GcsfsFilePath.from_directory_and_file_name(
            self.ingest_bucket, normalized_file_name).uri()

        if not self.dry_run:
            gsutil_cp(path, full_file_upload_path_uri)

        with self.mutex:
            self.copies_list.append((path, full_file_upload_path_uri))
            if self.move_progress:
                self.move_progress.next()
    def _generate_output_path(self,
                              ingest_view_export_args: GcsfsIngestViewExportArgs,
                              metadata: DirectIngestIngestFileMetadata) -> GcsfsFilePath:
        ingest_view = self.ingest_views_by_tag[ingest_view_export_args.ingest_view_name]
        if not metadata.normalized_file_name:
            output_file_name = to_normalized_unprocessed_file_name(
                f'{ingest_view.file_tag}.csv',
                GcsfsDirectIngestFileType.INGEST_VIEW,
                dt=ingest_view_export_args.upper_bound_datetime_to_export
            )
        else:
            output_file_name = metadata.normalized_file_name

        return GcsfsFilePath.from_directory_and_file_name(self.ingest_directory_path, output_file_name)
    def test_happy_path(self) -> None:
        pointed_at_staging_file = GcsfsFilePath.from_directory_and_file_name(
            self.config_with_path(
                "gnarly").pointed_to_staging_subdirectory().output_directory,
            "foo.txt",
        )
        self.assertEqual(pointed_at_staging_file.abs_path(),
                         "gnarly/staging/foo.txt")

        self.assertEqual(
            ExportBigQueryViewConfig.revert_staging_path_to_original(
                pointed_at_staging_file),
            GcsfsFilePath.from_absolute_path("gs://gnarly/foo.txt"),
        )
    def copy(self, src_path: GcsfsFilePath, dst_path: GcsfsPath) -> None:
        src_bucket = self.storage_client.bucket(src_path.bucket_name)
        src_blob = self._get_blob(src_path)

        dst_bucket = self.storage_client.bucket(dst_path.bucket_name)

        if isinstance(dst_path, GcsfsFilePath):
            dst_blob_name = dst_path.blob_name
        elif isinstance(dst_path, GcsfsDirectoryPath):
            dst_blob_name = GcsfsFilePath.from_directory_and_file_name(
                dst_path, src_path.file_name).blob_name
        else:
            raise ValueError(f"Unexpected path type [{type(dst_path)}]")

        src_bucket.copy_blob(src_blob, dst_bucket, dst_blob_name)
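
A hedged usage sketch of the two destination types this copy() accepts; the bucket and object names are hypothetical, and "fs" stands in for an instance of the class the method above belongs to.

src = GcsfsFilePath.from_absolute_path("gs://my-bucket/reports/report.csv")

# 1) Explicit destination file path: the destination blob name is taken from dst_path.
fs.copy(src, GcsfsFilePath.from_absolute_path("gs://other-bucket/archive/report_v2.csv"))

# 2) Destination directory: the source file name ("report.csv") is reused.
fs.copy(src, GcsfsDirectoryPath("other-bucket", "archive"))
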
    def copy(self, src_path: GcsfsFilePath, dst_path: GcsfsPath) -> None:
        if isinstance(dst_path, GcsfsFilePath):
            path = dst_path
        elif isinstance(dst_path, GcsfsDirectoryPath):
            path = GcsfsFilePath.from_directory_and_file_name(
                dst_path, src_path.file_name)
        else:
            raise ValueError(f"Unexpected path type [{type(dst_path)}]")

        with self.mutex:
            entry = self.files[src_path.abs_path()]
            self.files[path.abs_path()] = FakeGCSFileSystemEntry(
                path, entry.local_path, "application/octet-stream")

        if self.delegate:
            self.delegate.on_file_added(path)
Example #17
    def copy(self,
             src_path: GcsfsFilePath,
             dst_path: GcsfsPath) -> None:
        src_bucket = self.storage_client.get_bucket(src_path.bucket_name)
        src_blob = src_bucket.get_blob(src_path.blob_name)
        if not src_blob:
            raise ValueError(
                f'Blob at path [{src_path.abs_path()}] does not exist')
        dst_bucket = self.storage_client.get_bucket(dst_path.bucket_name)

        if isinstance(dst_path, GcsfsFilePath):
            dst_blob_name = dst_path.blob_name
        elif isinstance(dst_path, GcsfsDirectoryPath):
            dst_blob_name = GcsfsFilePath.from_directory_and_file_name(
                dst_path, src_path.file_name).blob_name
        else:
            raise ValueError(f'Unexpected path type [{type(dst_path)}]')

        src_bucket.copy_blob(src_blob, dst_bucket, dst_blob_name)
Example #18
    def _create_split_file_path(
        self,
        original_file_path: GcsfsFilePath,
        output_dir: GcsfsDirectoryPath,
        split_num: int,
    ) -> GcsfsFilePath:
        parts = filename_parts_from_path(original_file_path)

        rank_str = str(split_num + 1).zfill(5)
        updated_file_name = (
            f"{parts.stripped_file_name}_{rank_str}"
            f"_{SPLIT_FILE_SUFFIX}_size{self.ingest_file_split_line_limit}"
            f".{parts.extension}")

        return GcsfsFilePath.from_directory_and_file_name(
            output_dir,
            to_normalized_unprocessed_file_path(
                updated_file_name,
                file_type=parts.file_type,
                dt=parts.utc_upload_datetime,
            ),
        )
Example #19
    def get_output_path(self, chunk_num: int) -> GcsfsFilePath:
        name, _extension = os.path.splitext(self.path.file_name)

        return GcsfsFilePath.from_directory_and_file_name(
            self.temp_output_directory_path, f"temp_{name}_{chunk_num}.csv")
Example #20
    def output_path(self, extension: str) -> GcsfsFilePath:
        file_name = f'{self.view.view_id}.{extension}'
        return GcsfsFilePath.from_directory_and_file_name(self.output_directory, file_name)
    def get_output_path(self, chunk_num: int):
        name, _extension = os.path.splitext(self.path.file_name)

        return GcsfsFilePath.from_directory_and_file_name(
            self.output_directory_path,
            f'temp_direct_ingest_{name}_{chunk_num}.csv')