class DownloadFilesFromSftpController:
    """Class for interacting with and downloading files from SFTP servers."""
    def __init__(
        self,
        project_id: str,
        region: str,
        lower_bound_update_datetime: Optional[datetime.datetime],
        gcs_destination_path: Optional[GcsfsDirectoryPath] = None,
    ):
        self.project_id = project_id
        self.region = region.lower()

        self.auth = SftpAuth.for_region(region)
        self.delegate = SftpDownloadDelegateFactory.build(region_code=region)
        self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())

        self.unable_to_download_items: List[str] = []
        self.downloaded_items: List[Tuple[str, datetime.datetime]] = []
        self.skipped_files: List[str] = []

        self.lower_bound_update_datetime = lower_bound_update_datetime
        self.bucket = (
            gcsfs_sftp_download_bucket_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id)
            if gcs_destination_path is None
            else gcs_destination_path)
        self.download_dir = GcsfsDirectoryPath.from_dir_and_subdir(
            dir_path=self.bucket, subdir=RAW_INGEST_DIRECTORY)

        self.postgres_direct_ingest_file_metadata_manager = (
            PostgresDirectIngestRawFileMetadataManager(
                region,
                DirectIngestInstance.PRIMARY.database_version(
                    SystemLevel.STATE,
                    state_code=StateCode(self.region.upper())).name,
            ))

    def _is_after_update_bound(self, sftp_attr: SFTPAttributes) -> bool:
        """Returns True if the file's modification time is at or after the
        lower bound update datetime, or if no lower bound was provided."""
        if self.lower_bound_update_datetime is None:
            return True
        # st_mtime is a POSIX timestamp; fromtimestamp returns a naive local
        # datetime, so convert it to UTC when the lower bound is timezone-aware.
        update_time = datetime.datetime.fromtimestamp(sftp_attr.st_mtime)
        if self.lower_bound_update_datetime.tzinfo:
            update_time = update_time.astimezone(pytz.UTC)
        return update_time >= self.lower_bound_update_datetime

    def _fetch(
        self,
        connection: pysftp.Connection,
        file_path: str,
        file_timestamp: datetime.datetime,
    ) -> None:
        """Fetches data files from the SFTP, tracking which items downloaded and failed to download."""
        normalized_sftp_path = os.path.normpath(file_path)
        normalized_upload_path = GcsfsFilePath.from_directory_and_file_name(
            dir_path=self.download_dir,
            file_name=os.path.basename(
                to_normalized_unprocessed_file_path(
                    normalized_sftp_path,
                    file_type=GcsfsDirectIngestFileType.RAW_DATA,
                    dt=file_timestamp,
                )),
        )
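        # Only download files that the metadata manager has not yet discovered or
        # already processed for ingest.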
        if not self.postgres_direct_ingest_file_metadata_manager.has_raw_file_been_discovered(
                normalized_upload_path
        ) and not self.postgres_direct_ingest_file_metadata_manager.has_raw_file_been_processed(
                normalized_upload_path):
            logging.info("Downloading %s into %s", normalized_sftp_path,
                         self.download_dir)
            try:
                path = GcsfsFilePath.from_directory_and_file_name(
                    dir_path=self.download_dir, file_name=normalized_sftp_path)
                self.gcsfs.upload_from_contents_handle_stream(
                    path=path,
                    contents_handle=GcsfsSftpFileContentsHandle(
                        sftp_connection=connection, local_file_path=file_path),
                    content_type=BYTES_CONTENT_TYPE,
                )
                logging.info("Post processing %s", path.uri())
                self.downloaded_items.append((
                    self.delegate.post_process_downloads(path, self.gcsfs),
                    file_timestamp,
                ))
            except IOError as e:
                logging.info(
                    "Could not download %s into %s: %s",
                    normalized_sftp_path,
                    self.download_dir,
                    e.args,
                )
                self.unable_to_download_items.append(file_path)
        else:
            logging.info(
                "Skipping download of %s because it has already been downloaded for ingest.",
                normalized_sftp_path,
            )
            self.skipped_files.append(file_path)

    def get_paths_to_download(self) -> List[Tuple[str, datetime.datetime]]:
        """Opens a connection to the SFTP server and, based on the delegate, finds and
        recursively lists items that are after the update bound and match the delegate's
        criteria, returning the items and corresponding timestamps to be downloaded."""
        with pysftp.Connection(
                host=self.auth.hostname,
                username=self.auth.username,
                password=self.auth.password,
                cnopts=self.auth.connection_options,
        ) as connection:
            remote_dirs = connection.listdir()
            root = self.delegate.root_directory(remote_dirs)
            dirs_with_attributes = connection.listdir_attr(root)
            paths_post_timestamp = {
                sftp_attr.filename: datetime.datetime.fromtimestamp(
                    sftp_attr.st_mtime).astimezone(pytz.UTC)
                for sftp_attr in dirs_with_attributes
                if self._is_after_update_bound(sftp_attr)
            }
            paths_to_download = self.delegate.filter_paths(
                list(paths_post_timestamp.keys()))

            files_to_download_with_timestamps: List[Tuple[
                str, datetime.datetime]] = []
            for path in paths_to_download:
                file_timestamp = paths_post_timestamp[path]
                if connection.isdir(path):

                    def set_file(file_to_fetch: str,
                                 file_timestamp: datetime.datetime) -> None:
                        files_to_download_with_timestamps.append(
                            (file_to_fetch, file_timestamp))

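                    # pysftp's walktree visits every entry under `path`: fcallback
                    # handles regular files, dcallback handles directories (ignored
                    # here), and ucallback handles entries of unknown type, which are
                    # recorded as failures. functools.partial binds the parent
                    # directory's timestamp so every file under it shares that
                    # timestamp.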
                    connection.walktree(
                        remotepath=path,
                        fcallback=partial(set_file,
                                          file_timestamp=file_timestamp),
                        dcallback=lambda _: None,
                        ucallback=self.unable_to_download_items.append,
                        recurse=True,
                    )
                else:
                    files_to_download_with_timestamps.append(
                        (path, file_timestamp))
            return files_to_download_with_timestamps

    def clean_up(self) -> None:
        """Attempts to recursively remove any files downloaded to the GCS download directory as part of do_fetch."""
        try:
            logging.info("Cleaning up items in %s.", self.download_dir.uri())
            files_to_delete = self.gcsfs.ls_with_blob_prefix(
                bucket_name=self.bucket.abs_path(),
                blob_prefix=RAW_INGEST_DIRECTORY)
            for file in files_to_delete:
                self.gcsfs.delete(
                    GcsfsFilePath.from_absolute_path(file.abs_path()))
        except Exception as e:
            logging.info(
                "%s could not be cleaned up due to an error %s.",
                self.download_dir.uri(),
                e.args,
            )

    def fetch_files(
        self, files_to_download_with_timestamps: List[Tuple[str,
                                                            datetime.datetime]]
    ) -> None:
        """Opens up one connection and loops through all of the files with timestamps to upload
        to the GCS bucket."""
        with pysftp.Connection(
                host=self.auth.hostname,
                username=self.auth.username,
                password=self.auth.password,
                cnopts=self.auth.connection_options,
        ) as connection:
            for file_path, file_timestamp in files_to_download_with_timestamps:
                self._fetch(connection, file_path, file_timestamp)

    def do_fetch(
        self,
    ) -> MultiRequestResultWithSkipped[Tuple[str, datetime.datetime], str,
                                       str]:
        """Attempts to open an SFTP connection and download items, returning the corresponding paths
        and the timestamp associated, and also any unable to be downloaded."""
        logging.info(
            "Downloading raw files from SFTP server [%s] to ingest bucket [%s] for project [%s]",
            self.auth.hostname,
            self.bucket.uri(),
            self.project_id,
        )

        files_to_download_with_timestamps = self.get_paths_to_download()
        logging.info(
            "Found %s items to download from SFTP server [%s] to upload to ingest bucket [%s]",
            len(files_to_download_with_timestamps),
            self.auth.hostname,
            self.bucket.uri(),
        )

        self.fetch_files(files_to_download_with_timestamps)

        logging.info(
            "Download complete, successfully downloaded %s files to ingest bucket [%s] "
            "could not download %s files and skipped %s files",
            len(self.downloaded_items),
            self.download_dir.uri(),
            len(self.unable_to_download_items),
            len(self.skipped_files),
        )
        return MultiRequestResultWithSkipped(
            successes=self.downloaded_items,
            failures=self.unable_to_download_items,
            skipped=self.skipped_files,
        )
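
# A minimal usage sketch (not part of the original module). The project id, region
# code, and lower bound below are illustrative placeholders rather than values taken
# from the source:
#
#   controller = DownloadFilesFromSftpController(
#       project_id="my-gcp-project",
#       region="us_xx",
#       lower_bound_update_datetime=datetime.datetime(2021, 1, 1, tzinfo=pytz.UTC),
#   )
#   result = controller.do_fetch()
#   if result.failures:
#       controller.clean_up()
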
class UploadStateFilesToIngestBucketController:
    """Class for uploading files from a local filesystem to a region's ingest bucket."""

    SUPPORTED_EXTENSIONS: List[str] = [".csv", ".txt"]

    def __init__(
        self,
        paths_with_timestamps: List[Tuple[str, datetime.datetime]],
        project_id: str,
        region: str,
        gcs_destination_path: Optional[str] = None,
    ):
        self.paths_with_timestamps = paths_with_timestamps
        self.project_id = project_id
        self.region = region.lower()

        self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())
        self.gcs_destination_path = (
            GcsfsDirectoryPath.from_absolute_path(
                gcsfs_direct_ingest_directory_path_for_region(
                    region, SystemLevel.STATE, project_id=self.project_id))
            if gcs_destination_path is None else
            GcsfsDirectoryPath.from_absolute_path(gcs_destination_path))
        self.uploaded_files: List[str] = []
        self.unable_to_upload_files: List[str] = []

    def _copy_to_ingest_bucket(self, path: str,
                               full_file_upload_path: GcsfsFilePath) -> None:
        try:
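            # Move the source file into the ingest bucket, then tag it with a content
            # type inferred from its extension, falling back to text/plain.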
            mimetype, _ = guess_type(os.path.basename(path))
            self.gcsfs.mv(
                src_path=GcsfsFilePath.from_absolute_path(path),
                dst_path=full_file_upload_path,
            )
            self.gcsfs.set_content_type(full_file_upload_path,
                                        mimetype if mimetype else "text/plain")
            logging.info("Copied %s -> %s", path, full_file_upload_path.uri())
            self.uploaded_files.append(path)
        except Exception as e:
            logging.warning(
                "Could not copy %s -> %s due to error %s",
                path,
                full_file_upload_path.uri(),
                e.args,
            )
            self.unable_to_upload_files.append(path)

    def _upload_file(
            self, path_with_timestamp: Tuple[str, datetime.datetime]) -> None:
        path, timestamp = path_with_timestamp
        normalized_file_name = os.path.basename(
            to_normalized_unprocessed_file_path(
                path,
                file_type=GcsfsDirectIngestFileType.RAW_DATA,
                dt=timestamp))
        full_file_upload_path = GcsfsFilePath.from_directory_and_file_name(
            self.gcs_destination_path, normalized_file_name)
        self._copy_to_ingest_bucket(path, full_file_upload_path)

    def get_paths_to_upload(self) -> List[Tuple[str, datetime.datetime]]:
        """Returns the paths to upload along with the timestamps they should be normalized
        with, skipping any files whose extensions are not supported."""
        path_candidates = []
        for path, timestamp in self.paths_with_timestamps:
            if self.gcsfs.is_dir(path):
                directory = GcsfsDirectoryPath.from_absolute_path(path)
                files_in_directory = self.gcsfs.ls_with_blob_prefix(
                    bucket_name=directory.bucket_name,
                    blob_prefix=directory.relative_path,
                )
                for file in files_in_directory:
                    path_candidates.append((file.abs_path(), timestamp))
            elif self.gcsfs.is_file(path):
                file = GcsfsFilePath.from_absolute_path(path)
                path_candidates.append((file.abs_path(), timestamp))
            else:
                logging.warning(
                    "Could not identify %s as a directory or a file in %s. Skipping.",
                    path,
                    self.gcs_destination_path.uri(),
                )
                self.unable_to_upload_files.append(path)

        result = []
        for path, timestamp in path_candidates:
            _, ext = os.path.splitext(path)
            if not ext or ext not in self.SUPPORTED_EXTENSIONS:
                logging.info("Skipping file [%s] - invalid extension %s", path,
                             ext)
                continue
            result.append((path, timestamp))

        return result

    def upload_files(
        self,
        paths_with_timestamps_to_upload: List[Tuple[str, datetime.datetime]],
        thread_pool: ThreadPool,
    ) -> None:
        thread_pool.map(self._upload_file, paths_with_timestamps_to_upload)

    def do_upload(self) -> Tuple[List[str], List[str]]:
        """Perform upload to ingest bucket."""
        logging.info(
            "Uploading raw files to the %s ingest bucket [%s] for project [%s].",
            self.region,
            self.gcs_destination_path.uri(),
            self.project_id,
        )

        paths_with_timestamps_to_upload = self.get_paths_to_upload()
        logging.info(
            "Found %s items to upload to ingest bucket [%s]",
            len(paths_with_timestamps_to_upload),
            self.gcs_destination_path.uri(),
        )

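        # Uploads are I/O bound, so fan the copies out across a pool of worker threads.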
        thread_pool = ThreadPool(processes=12)

        self.upload_files(paths_with_timestamps_to_upload, thread_pool)

        thread_pool.close()
        thread_pool.join()

        logging.info(
            "Upload complete, successfully uploaded %s files to ingest bucket [%s], "
            "could not upload %s files",
            len(self.uploaded_files),
            self.gcs_destination_path.uri(),
            len(self.unable_to_upload_files),
        )

        return self.uploaded_files, self.unable_to_upload_files
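

# A minimal end-to-end usage sketch (not part of the original module): one plausible
# way to chain the two controllers so that files pulled from SFTP are then normalized
# into the ingest bucket. The project id and region code are illustrative placeholders:
#
#   download_result = DownloadFilesFromSftpController(
#       project_id="my-gcp-project",
#       region="us_xx",
#       lower_bound_update_datetime=None,
#   ).do_fetch()
#   uploaded, failed = UploadStateFilesToIngestBucketController(
#       paths_with_timestamps=download_result.successes,
#       project_id="my-gcp-project",
#       region="us_xx",
#   ).do_upload()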