def sync_instrument(self, instrument: Instrument) -> None:
    """Copy each of an instrument's files from the SFTP server to GCP.

    The destination for a file is looked up in the mapping returned by
    ``instrument.get_blob_filepaths()``; the source is the file name
    appended to ``instrument.sftp_path``. Each pair is handed to
    ``self.sync_file``.

    Args:
        instrument: The instrument whose ``files`` should be synced.
    """
    destinations = instrument.get_blob_filepaths()
    for filename in instrument.files:
        destination = destinations[filename]
        source = f"{instrument.sftp_path}/{filename}"
        log.info(f"Syncing file from SFTP: {source} to GCP: {destination}")
        self.sync_file(destination, source)
def get_instrument_folders(self) -> Dict[str, Instrument]:
    """Find the instrument folders beneath the survey source path.

    Returns:
        A dict keyed by folder name. Each value is an ``Instrument`` whose
        ``sftp_path`` is the survey source path joined to the folder name.
        Folders whose name does not match ``config.instrument_regex`` at
        the start of the name are left out.
    """
    survey_path = self.sftp_config.survey_source_path
    folder_names = self.sftp_connection.listdir(survey_path)
    # Compile once: the pattern does not change while the listing is scanned.
    folder_pattern = re.compile(self.config.instrument_regex)

    instruments = {}
    for folder in folder_names:
        if folder_pattern.match(folder):
            log.info(f"Instrument folder found - {folder}")
            instruments[folder] = Instrument(
                sftp_path=f"{survey_path}/{folder}"
            )
    return instruments
def _filter_non_bdbx(
    _self, instruments: Dict[str, Instrument]
) -> Dict[str, Instrument]:
    """Keep only the instruments that have a ``.bdbx`` file.

    The check is made on the lower-cased suffix of every entry in
    ``instrument.files``, so it is case-insensitive.

    Args:
        instruments: Candidate instruments keyed by name.

    Returns:
        A new dict holding the entries that passed; the input is not
        modified. A message is logged for every instrument dropped.
    """
    kept = {}
    for name, candidate in instruments.items():
        suffixes = {
            pathlib.Path(filename).suffix.lower()
            for filename in candidate.files
        }
        if ".bdbx" not in suffixes:
            log.info("Instrument database file not found - " +
                     f"{name} - not importing")
            continue
        kept[name] = candidate
    return kept
def _get_instrument_files_for_instrument(
    self, instrument: Instrument
) -> List[str]:
    """List the files in an instrument's SFTP folder that should be moved.

    Side effect: when a ``.bdbx`` file is seen, ``instrument.bdbx_updated_at``
    is set to that file's modification time as a timezone-aware UTC
    datetime. If several are present the last one listed wins.

    Args:
        instrument: The instrument whose ``sftp_path`` is listed.

    Returns:
        The file names whose lower-cased suffix is in
        ``config.extension_list``, in listing order.
    """
    matched_filenames = []
    for entry in self.sftp_connection.listdir_attr(instrument.sftp_path):
        filename = entry.filename
        suffix = pathlib.Path(filename).suffix.lower()
        if suffix == ".bdbx":
            instrument.bdbx_updated_at = datetime.fromtimestamp(
                entry.st_mtime, tz=timezone.utc)
        if suffix not in self.config.extension_list:
            continue
        log.info(f"Instrument file found - {filename}")
        matched_filenames.append(filename)
    return matched_filenames
def generate_bdbx_md5(self, instrument: Instrument) -> str:
    """Return the MD5 hex digest of an instrument's ``.bdbx`` file on SFTP.

    The remote file is read in ``config.bufsize``-sized chunks so it is
    never held in memory all at once.

    Args:
        instrument: The instrument whose ``bdbx_file()`` path is hashed.

    Returns:
        The hex digest, or ``""`` when ``instrument.bdbx_file()`` returns a
        falsy value (a message is logged in that case).
    """
    bdbx_file = instrument.bdbx_file()
    if not bdbx_file:
        log.info(
            f"No bdbx file for '{instrument.sftp_path}' cannot generate an md5"
        )
        return ""

    bufsize = self.config.bufsize
    bdbx_details = self.sftp_connection.stat(bdbx_file)
    chunks = math.ceil(bdbx_details.st_size / bufsize)
    md5sum = hashlib.md5()

    sftp_file = self.sftp_connection.open(bdbx_file, bufsize=bufsize)
    try:
        for chunk in range(chunks):
            sftp_file.seek(chunk * bufsize)
            md5sum.update(sftp_file.read(bufsize))
    finally:
        # Always release the remote file handle, even if a read fails.
        sftp_file.close()
    return md5sum.hexdigest()
def send_request_to_api(self, instrument_name):
    """POST a data request for an instrument to the Blaise API.

    The request is sent with a 10 second timeout and a read timeout is
    deliberately ignored: the connection to the API was timing out before
    the API completed the work, and not waiting for the response also
    allows parallel requests to be made to the API.

    Args:
        instrument_name: Used both in the URL and as the
            ``instrumentDataPath`` value of the JSON body.
    """
    log.info(f"Sending request to {self.config.blaise_api_url} " +
             f"for instrument {instrument_name}")
    endpoint = (
        f"http://{self.config.blaise_api_url}/api/v1/serverparks/" +
        f"{self.config.server_park}/instruments/{instrument_name}/data"
    )
    try:
        requests.post(
            endpoint,
            headers={"content-type": "application/json"},
            json={"instrumentDataPath": instrument_name},
            timeout=10,
        )
    except requests.exceptions.ReadTimeout:
        # Expected: the API keeps working after the client stops waiting.
        return
def _get_latest_conflicting_instrument(
    _self,
    instruments: Dict[str, Instrument],
    confilcting_instruments: Dict[str, List[str]],
    instrument_name: str,
) -> Instrument:
    """Pick the most recently updated instrument among conflicting folders.

    Args:
        instruments: All instruments keyed by name.
        confilcting_instruments: Maps a lower-cased instrument name to the
            names in ``instruments`` that conflict under it.
        instrument_name: Name whose lower-cased form selects the conflict
            group.

    Returns:
        The instrument in the group with the greatest ``bdbx_updated_at``.
        A message is logged for every other instrument in the group.
    """
    rival_names = confilcting_instruments[instrument_name.lower()]
    # dict.fromkeys drops repeated names while keeping first-seen order.
    rivals = [instruments[name] for name in dict.fromkeys(rival_names)]
    rivals.sort(key=operator.attrgetter("bdbx_updated_at"), reverse=True)

    newest = rivals[0]
    for older in rivals[1:]:
        log.info(
            f"Found newer instrument '{newest.sftp_path}' " +
            f"folder - Skipping this folder '{older.sftp_path}'")
    return newest
def process_instrument(case_mover: CaseMover, instrument_name: str,
                       instrument: Instrument) -> None:
    """Sync one instrument and notify the API, unless it is to be skipped.

    Args:
        case_mover: Provides ``bdbx_md5_changed``, ``sync_instrument`` and
            ``send_request_to_api``.
        instrument_name: Name used in the log messages.
        instrument: The instrument to process.
    """
    log.info(
        f"Processing instrument - {instrument_name} - {instrument.sftp_path}")

    # NOTE(review): the instrument is skipped when bdbx_md5_changed() is
    # truthy, yet the message says there are no changes. The predicate is
    # defined elsewhere - confirm whether it really returns True when the
    # checksums match, or whether this branch is inverted.
    if case_mover.bdbx_md5_changed(instrument):
        log.info(f"Instrument - {instrument_name} - " +
                 "has no changes to the databse file, skipping...")
        return

    log.info(f"Syncing instrument - {instrument_name}")
    case_mover.sync_instrument(instrument)
    case_mover.send_request_to_api(instrument.gcp_folder())
def handle_exception(exception):
    """Log an exception and produce a generic HTTP 500 response.

    Args:
        exception: The exception to report; it is written to the error log.

    Returns:
        The tuple ``("Exception occurred", 500)``.
    """
    log.error("Exception - %s", exception)
    log.info("SFTP connection closed")
    response_body, status_code = "Exception occurred", 500
    return response_body, status_code
def main():
    """Move every filtered instrument from the SFTP server to the bucket.

    Configuration is read from ``current_app.nisra_config`` and
    ``current_app.sftp_config``.

    Returns:
        A ``(message, status)`` tuple: 500 when the storage bucket is
        ``None``, otherwise 200 (including when no instruments are found).
    """
    config = current_app.nisra_config
    sftp_config = current_app.sftp_config

    google_storage = init_google_storage(config)
    if google_storage.bucket is None:
        return "Connection to bucket failed", 500

    log.info("Connecting to SFTP server")
    # NOTE(review): hostkeys = None switches off pysftp's host key checking,
    # so the server's identity is not verified. Confirm this is intentional.
    connection_options = pysftp.CnOpts()
    connection_options.hostkeys = None
    connection_settings = dict(
        host=sftp_config.host,
        username=sftp_config.username,
        password=sftp_config.password,
        port=int(sftp_config.port),
        cnopts=connection_options,
    )

    with pysftp.Connection(**connection_settings) as sftp_connection:
        log.info("Connected to SFTP server")
        sftp = SFTP(sftp_connection, sftp_config, config)
        case_mover = CaseMover(google_storage, config, sftp)
        instruments = get_filtered_instruments(sftp)
        log.info(f"Processing survey - {sftp_config.survey_source_path}")

        if len(instruments) == 0:
            log.info("No instrument folders found")
            return "No instrument folders found, exiting", 200

        for instrument_name, instrument in instruments.items():
            process_instrument(case_mover, instrument_name, instrument)

    log.info("SFTP connection closed")
    log.info("Process complete")
    return "Process complete", 200
def log(self):
    """Write bucket_name, instrument_regex, extension_list, server_park
    and blaise_api_url to the service log, one ``name - value`` line each.
    """
    field_names = (
        "bucket_name",
        "instrument_regex",
        "extension_list",
        "server_park",
        "blaise_api_url",
    )
    for field_name in field_names:
        value = getattr(self, field_name)
        if field_name == "extension_list":
            # This field is converted with str() explicitly before logging.
            value = str(value)
        log.info(f"{field_name} - {value}")
from flask import Flask
from app.mover import mover
from pkg.config import Config
from pkg.sftp import SFTPConfig
from util.service_logging import log

# The Flask application object for this service.
app = Flask(__name__)


def load_config(app: Flask) -> None:
    """Build the service configuration and attach it to the Flask app.

    Both configuration objects are created through their ``from_env``
    constructors, logged, and stored on ``app`` as ``nisra_config`` and
    ``sftp_config``.

    Args:
        app: The Flask application that receives the two attributes.

    Raises:
        Exception: If ``survey_source_path`` on the SFTP config is an
            empty string.
    """
    sftp_config = SFTPConfig.from_env()
    config = Config.from_env()
    # Refuse to start with a blank source path.
    if sftp_config.survey_source_path == "":
        log.error("survey_source_path is blank")
        raise Exception("survey_source_path is blank")
    config.log()
    sftp_config.log()
    app.nisra_config = config
    app.sftp_config = sftp_config


# NOTE(review): the two statements below are assumed to be module-level
# (run once on import) rather than part of load_config - confirm against
# the original layout.
app.register_blueprint(mover)
log.info("Application started")
def log(self):
    """Write survey_source_path, host, port and username to the service
    log, one ``label - value`` line each.
    """
    labelled_attributes = (
        ("survey_source_path", "survey_source_path"),
        ("sftp_host", "host"),
        ("sftp_port", "port"),
        ("sftp_username", "username"),
    )
    for label, attribute in labelled_attributes:
        log.info(f"{label} - {getattr(self, attribute)}")