def execute(self, context):
    """Copy matching files from the SFTP directory to S3, then write a ``_SUCCESS`` marker.

    Files in ``self.sftp_path`` whose lower-cased name ends with
    ``self.file_extensions`` are downloaded one at a time into a temporary
    file and uploaded to ``self.s3_bucket`` (overwriting existing keys).

    :param context: Airflow task context (not used here).
    """
    sftp_hook = SFTPHook(ftp_conn_id=self.sftp_conn_id)
    s3_hook = S3Hook(self.s3_conn_id)

    # Select the files up front so a bad ``file_extensions`` value fails
    # before anything is transferred.
    filtered_files_by_extensions = []
    for key in sftp_hook.list_directory(self.sftp_path):
        if key.lower().endswith(self.file_extensions):
            filtered_files_by_extensions.append(key)

    for sftp_file in filtered_files_by_extensions:
        # The upload must happen inside the ``with``: the temporary file is
        # removed as soon as the context manager exits.
        with NamedTemporaryFile("w") as tmp:
            sftp_hook.retrieve_file(f'{self.sftp_path}/{sftp_file}', tmp.name)
            s3_hook.load_file(
                filename=tmp.name,
                key=self.get_s3_key(f'{self.s3_prefix}/{sftp_file}'),
                bucket_name=self.s3_bucket,
                replace=True,
            )

    # Add the empty _SUCCESS file to indicate the task is done successfully
    s3_hook.load_string(
        '',
        key=self.get_s3_key(f'{self.s3_prefix}/_SUCCESS'),
        bucket_name=self.s3_bucket,
        replace=True,
    )
class SFTPSensor(BaseSensorOperator):
    """
    Waits for a file or directory to be present on SFTP.

    :param path: Remote file or directory path
    :type path: str
    :param sftp_conn_id: The connection to run the sensor against
    :type sftp_conn_id: str
    """
    template_fields = ('path', )

    @apply_defaults
    def __init__(self, path, sftp_conn_id='sftp_default', *args, **kwargs):
        super(SFTPSensor, self).__init__(*args, **kwargs)
        self.path = path
        # Pass the connection id positionally: the other SFTPHook call sites
        # use either a positional argument or ``ftp_conn_id=``, never
        # ``sftp_conn_id=``, so the positional form is the one that works
        # regardless of how the hook names its first parameter.
        self.hook = SFTPHook(sftp_conn_id)

    def poke(self, context):
        """Return True once ``self.path`` exists on the SFTP server.

        :param context: Airflow task context (not used here).
        :return: ``False`` when the server reports "no such file", ``True``
            when the modification time could be read.
        :raises IOError: for any SFTP error other than "no such file".
        """
        logging.info('Poking for %s', self.path)
        try:
            # Only used as an existence probe; the returned value is ignored.
            self.hook.get_mod_time(self.path)
        except IOError as e:
            if e.errno != SFTP_NO_SUCH_FILE:
                # Bare raise keeps the original traceback intact.
                raise
            return False
        # The connection is closed only on success; while the path is
        # missing it is left as-is for the next poke.
        self.hook.close_conn()
        return True
def setUp(self):
    """Prepare the test connection, a hook, a scratch directory and a fixture file."""
    configuration.load_test_config()
    # Keep the previous login so it can be restored later
    # (presumably in tearDown -- not visible from here).
    self.old_login = self.update_connection(SFTP_CONNECTION_USER)
    self.hook = SFTPHook()

    scratch_dir = os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS)
    os.makedirs(scratch_dir)

    fixture_path = os.path.join(TMP_PATH, TMP_FILE_FOR_TESTS)
    with open(fixture_path, 'a') as fixture:
        fixture.write('Test file')
class SFTPSensor(BaseSensorOperator):
    """Sensor that succeeds when files matching a regex have stopped changing.

    The directory is listed twice, 30 seconds apart. A file counts as ready
    when its name matches ``filepattern`` and its ``size`` and ``modify``
    attributes are identical in both listings. The ready file names are
    pushed to XCom under the key ``file_name``.

    :param filepath: Remote directory to inspect.
    :param filepattern: Regular expression matched (``re.match``) against
        each file name.
    :param sftp_conn_id: Connection id handed to ``SFTPHook``.
    """

    @apply_defaults
    def __init__(self, filepath, filepattern, sftp_conn_id='sftp_default', *args, **kwargs):
        super(SFTPSensor, self).__init__(*args, **kwargs)
        self.filepath = filepath
        self.filepattern = filepattern
        self.sftp_conn_id = sftp_conn_id
        self.hook = SFTPHook(ftp_conn_id=sftp_conn_id, keepalive_interval=10)

    def poke(self, context):
        """Return True when at least one matching file is unchanged across two listings.

        :param context: Airflow task context; ``context["task_instance"]`` is
            used to push the list of ready file names.
        :return: ``True`` if any file met the criteria, otherwise ``False``
            (including when the directory does not exist).
        :raises IOError: for any SFTP error other than "no such file".
        """
        full_path = self.filepath
        file_pattern = re.compile(self.filepattern)
        fileList = []
        try:
            first_listing = self.hook.describe_directory(full_path)
            logging.info('Polling Interval 1')

            # Build a filtered copy instead of deleting from the dict while
            # iterating over it, which raises RuntimeError on Python 3.
            directory = {}
            for file, attrs in first_listing.items():
                if file_pattern.match(file):
                    directory[file] = attrs
                else:
                    self.log.info(file)
                    self.log.info(file_pattern)

            if not directory:
                # If directory has no files that match the mask, exit
                return False

            # wait before we compare file sizes and timestamps again to
            # verify that the file is done transferring to remote loc
            time.sleep(30)
            logging.info('Post-Wait Polling')
            newDirectoryResults = self.hook.describe_directory(full_path)

            for file, attrs in newDirectoryResults.items():
                if file not in directory:
                    continue
                before = directory[file]
                if attrs['size'] == before['size'] and attrs['modify'] == before['modify']:
                    fileList.append(file)
                    print(
                        'filename: {} with size {} and modified time of {} met all criteria to be moved.'
                        .format(file, attrs['size'], attrs['modify']))

            context["task_instance"].xcom_push("file_name", fileList)
            return bool(fileList)
        except IOError as e:
            if e.errno != SFTP_NO_SUCH_FILE:
                raise
            return False
def execute(self, context):
    """Relay files from one source SFTP directory to every configured target.

    Each entry of ``self.target_full_path`` is indexed as
    ``(connection id, remote directory)``. A source file is transferred when
    it passes ``self.filter_function`` (if one is set) and is either missing
    on the target or ``self.overwrite_target`` is ``True``. Files are staged
    in ``self.work_path`` and removed again after the upload attempt.

    :param context: Airflow task context (not used here).
    """
    source_hook = SFTPHook(ftp_conn_id=self.source_conn_id)
    source_files = source_hook.list_directory(self.source_path)

    for target in self.target_full_path:
        target_connection, target_path = target[0], target[1]
        self.log.info(
            f"Beginning transfer to SFTP site {target_connection} and directory {target_path}"
        )
        target_hook = SFTPHook(ftp_conn_id=target_connection)
        target_files = target_hook.list_directory(target_path)

        for name in source_files:
            # Guard clauses: skip filtered-out files and files already present.
            if self.filter_function is not None and not self.filter_function(name):
                continue
            if self.overwrite_target is not True and name in target_files:
                continue

            staged = op.join(self.work_path, name)
            source_hook.retrieve_file(op.join(self.source_path, name), staged)
            self.log.info("Downloaded the file %s from the source SFTP", name)
            try:
                target_hook.store_file(op.join(target_path, name), staged)
                self.log.info("Uploaded the file %s to the destination SFTP", name)
            finally:
                # Remove the staged copy whether or not the upload worked.
                os.remove(os.path.join(self.work_path, name))
def execute(self, context):
    """Download matching S3 objects and upload them to SFTP as numbered parts.

    Keys under ``self.s3_prefix`` whose lower-cased name ends with
    ``self.file_extensions`` are written to ``self.sftp_path`` as
    ``<sftp_filename_prefix>-part-<index><extension>``, where the index
    counts from 0 in listing order.

    :param context: Airflow task context (not used here).
    """
    sftp_hook = SFTPHook(ftp_conn_id=self.sftp_conn_id)
    s3_hook = S3Hook(self.s3_conn_id)
    s3_client = s3_hook.get_conn()
    sftp_client = sftp_hook.get_conn()

    s3_keys_filtered_by_extensions = []
    for candidate in s3_hook.list_keys(self.s3_bucket, prefix=self.s3_prefix):
        if candidate.lower().endswith(self.file_extensions):
            s3_keys_filtered_by_extensions.append(candidate)

    for part_count, s3_key in enumerate(s3_keys_filtered_by_extensions):
        with NamedTemporaryFile("w") as tmp:
            s3_client.download_file(self.s3_bucket, s3_key, tmp.name)
            file_extension = os.path.splitext(s3_key)[1]
            remote_filename = f'{self.sftp_filename_prefix}-part-{part_count}{file_extension}'
            sftp_client.put(tmp.name, os.path.join(self.sftp_path, remote_filename))
class SFTPSensor(BaseSensorOperator):
    """
    Waits for a file or directory to be present on SFTP.

    :param path: Remote file or directory path
    :type path: str
    :param sftp_conn_id: The connection to run the sensor against
    :type sftp_conn_id: str
    """
    template_fields = ('path',)

    @apply_defaults
    def __init__(self, path, sftp_conn_id='sftp_default', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.hook = SFTPHook(sftp_conn_id)
        self.path = path

    def poke(self, context):
        """Probe the remote path; True when it exists, False when it is missing.

        Any ``IOError`` whose ``errno`` is not ``SFTP_NO_SUCH_FILE`` is
        re-raised. The hook connection is closed only on success.
        """
        self.log.info('Poking for %s', self.path)
        try:
            # The modification time itself is not needed; the call only
            # serves as an existence check.
            self.hook.get_mod_time(self.path)
        except IOError as e:
            if e.errno == SFTP_NO_SUCH_FILE:
                return False
            raise e
        self.hook.close_conn()
        return True
def check_file(**context):
    """Check whether a file exists on an SFTP host and return a caller-chosen value.

    Keys read from ``context``:
        templates_dict['file_path'] -- full path of the file to look for
        conn_id -- connection id handed to ``SFTPHook``
        id_true -- value returned when the file exists
        id_false -- value returned when the file does not exist

    Returns:
        The ``id_true`` value if ``SFTPHook.path_exists`` is truthy for the
        path, otherwise the ``id_false`` value.
    """
    # All four lookups happen before the hook is created, so a missing key
    # raises KeyError without opening a connection.
    f_path = context['templates_dict']['file_path']
    conn = context['conn_id']
    if_true = context['id_true']
    if_false = context['id_false']

    sh = SFTPHook(conn)
    return if_true if sh.path_exists(f_path) else if_false
class SFTPSensor(BaseSensorOperator):
    """
    Airflow SFTP sensor that monitors a directory for files matching a
    shell-style pattern and reports the oldest match.

    :param filepath: Remote directory to list.
    :param filepattern: ``fnmatch`` pattern applied to each file name.
    :param sftp_conn_id: Connection id handed to ``SFTPHook``.
    """

    @apply_defaults
    def __init__(self, filepath, filepattern, sftp_conn_id='sftp_default', *args, **kwargs):
        super(SFTPSensor, self).__init__(*args, **kwargs)
        self.filepath = filepath
        self.filepattern = filepattern
        self.hook = SFTPHook(sftp_conn_id)

    def poke(self, context):
        """Push the oldest matching file name to XCom and report whether one exists.

        :param context: Airflow task context; ``context["task_instance"]`` is
            used to push the result under the key ``file_name``.
        :return: ``True`` when at least one file matched, otherwise ``False``.
        """
        full_path = self.filepath
        pattern = self.filepattern

        # Keyed by integer modification time. Two files with the same
        # timestamp collapse to one entry (the last one listed wins).
        dict_files = {}
        for file in self.hook.list_directory(full_path):
            if not fnmatch.fnmatch(file, pattern):
                self.log.info(file)
                self.log.info(pattern)
                continue
            self.log.info("File found {}".format(file))
            dict_files[int(self.hook.get_mod_time(full_path + "/" + file))] = file

        print("files found with modified time : {0}".format(dict_files))

        if not dict_files:
            # Explicit False rather than falling off the end with None.
            return False

        # The smallest timestamp is the oldest file; no full sort is needed.
        oldest_file = dict_files[min(dict_files)]
        context["task_instance"].xcom_push("file_name", oldest_file)
        self.log.info("xcom_pushed : {}".format(oldest_file))
        return True
def moveFromSourceToLocal(**kwargs):
    """
    Download the file named in the triggering ``dag_run`` configuration and
    verify it by comparing MD5 sums taken before and after the transfer.

    Reads ``SFTP_Connection_Name`` and ``File_Name`` from
    ``kwargs['dag_run'].conf``. The local copy is placed in
    ``LOCAL_LANDING_PATH`` under the remote base name with every ``.ready``
    occurrence removed.

    Raises:
        Exception: when the remote and local MD5 sums differ.
    """
    run_conf = kwargs['dag_run'].conf
    sftpConn = run_conf['SFTP_Connection_Name']
    sourceFullPath = run_conf['File_Name']

    # Strip the ".ready" from the filename as we get the basename of the file
    fileName = os.path.basename(run_conf['File_Name']).replace('.ready', '')
    destFullPath = os.path.join(LOCAL_LANDING_PATH, fileName)

    sftpHook = SFTPHook(ftp_conn_id=sftpConn)
    conn = sftpHook.get_conn()

    initialMD5sum = getMD5sumRemote(conn, sourceFullPath)
    logging.info('Initial MD5Sum: {}'.format(initialMD5sum))

    sftpHook.retrieve_file(sourceFullPath, destFullPath)

    currentMD5sum = getMD5sumLocal(destFullPath)
    logging.info('currentMD5Sum: {}'.format(currentMD5sum))

    if initialMD5sum == currentMD5sum:
        return

    logging.error(
        'MD5Sum mismatch. Initial: {} Post-Transfer: {}'.format(
            initialMD5sum, currentMD5sum))
    raise Exception(
        'MD5Sum values before and after transfer do not match. Possible transfer issue. Initial: {} Post-Transfer: {}'
        .format(initialMD5sum, currentMD5sum))
def _copy_single_object(
    self,
    gcs_hook: GoogleCloudStorageHook,
    sftp_hook: SFTPHook,
    source_object: str,
    destination_path: str,
) -> None:
    """
    Copy one object from ``self.source_bucket`` to a path on the SFTP server.

    The parent directory of ``destination_path`` is created first. When
    ``self.move_object`` is truthy the source object is deleted afterwards.

    :param gcs_hook: Hook used to download (and optionally delete) the object.
    :param sftp_hook: Hook used to create the directory and store the file.
    :param source_object: Name of the object inside ``self.source_bucket``.
    :param destination_path: Full remote path to write on the SFTP server.
    """
    self.log.info(
        "Executing copy of gs://%s/%s to %s",
        self.source_bucket,
        source_object,
        destination_path,
    )

    sftp_hook.create_directory(os.path.dirname(destination_path))

    # Stage through a temporary file, which only exists inside the ``with``.
    with NamedTemporaryFile("w") as staged:
        gcs_hook.download(
            bucket_name=self.source_bucket,
            object_name=source_object,
            filename=staged.name,
        )
        sftp_hook.store_file(destination_path, staged.name)

    if not self.move_object:
        return

    self.log.info("Executing delete of gs://%s/%s", self.source_bucket, source_object)
    gcs_hook.delete(self.source_bucket, source_object)
def execute(self, context):
    """Delete ``self.file_path`` from the SFTP server.

    :param context: Airflow task context (not used here).
    :return: Always ``True``; a failed delete surfaces as an exception from
        the hook.
    """
    log = self.log
    log.info("Going to start delete file sftp operator")

    sftp_hook = SFTPHook(ftp_conn_id=self.sftp_conn_id)
    # NOTE(review): this sets the hook's ``no_host_key_check`` flag, which by
    # its name turns off host key checking -- confirm that is intended.
    sftp_hook.no_host_key_check = True
    sftp_hook.delete_file(self.file_path)

    log.info("Finished executing delete file sftp operator")
    return True
def _copy_single_object(
    self,
    gcs_hook: GoogleCloudStorageHook,
    sftp_hook: SFTPHook,
    source_path: str,
    destination_object: str,
) -> None:
    """
    Copy one file from the SFTP server into ``self.destination_bucket``.

    When ``self.move_object`` is truthy the source file is deleted from the
    SFTP server after the upload.

    :param gcs_hook: Hook used to upload the object.
    :param sftp_hook: Hook used to retrieve (and optionally delete) the file.
    :param source_path: Full remote path of the file on the SFTP server.
    :param destination_object: Object name to write inside the bucket.
    """
    self.log.info(
        "Executing copy of %s to gs://%s/%s",
        source_path,
        self.destination_bucket,
        destination_object,
    )

    # Stage through a temporary file, which only exists inside the ``with``.
    with NamedTemporaryFile("w") as staged:
        sftp_hook.retrieve_file(source_path, staged.name)
        gcs_hook.upload(
            bucket_name=self.destination_bucket,
            object_name=destination_object,
            filename=staged.name,
            mime_type=self.mime_type,
        )

    if not self.move_object:
        return

    self.log.info("Executing delete of %s", source_path)
    sftp_hook.delete_file(source_path)
def execute(self, context):
    """Copy one file, or every file matching a single-wildcard path, from SFTP to GCS.

    With a wildcard in ``self.source_path``, the path is split around it
    into a prefix and a delimiter, the tree under the prefix's directory is
    listed, and each file is copied with that base directory replaced by
    ``self.destination_path``. Without a wildcard, the single file is copied
    to ``self.destination_path``, or to its own base name when no
    destination is set.

    :param context: Airflow task context (not used here).
    :raises AirflowException: if ``self.source_path`` contains more than one
        wildcard.
    """
    gcs_hook = GoogleCloudStorageHook(gcp_conn_id=self.gcp_conn_id,
                                      delegate_to=self.delegate_to)
    sftp_hook = SFTPHook(self.sftp_conn_id)

    if WILDCARD in self.source_path:
        total_wildcards = self.source_path.count(WILDCARD)
        if total_wildcards > 1:
            raise AirflowException(
                "Only one wildcard '*' is allowed in source_path parameter. "
                "Found {} in {}.".format(total_wildcards, self.source_path))

        prefix, delimiter = self.source_path.split(WILDCARD, 1)
        base_path = os.path.dirname(prefix)

        files, _, _ = sftp_hook.get_tree_map(base_path, prefix=prefix, delimiter=delimiter)

        for file in files:
            # Replace only the first occurrence so a repeated directory name
            # deeper in the path is left untouched.
            destination_path = file.replace(base_path, self.destination_path, 1)
            self._copy_single_object(gcs_hook, sftp_hook, file, destination_path)
    else:
        # [-1] instead of [1]: a source path without any "/" yields a
        # one-element list, which used to raise IndexError here.
        destination_object = (self.destination_path if self.destination_path else
                              self.source_path.rsplit("/", 1)[-1])
        self._copy_single_object(gcs_hook, sftp_hook, self.source_path,
                                 destination_object)
def createTestFile(**kwargs):
    """
    Create a test file on the first configured SFTP site and encrypt it with
    gpg, to initiate the transfer process.

    Settings come from the first entry of ``dag_config['SFTP_Polling_Sites']``
    and its first feed group. Both commands are run on the remote side via
    the connection's ``execute`` and their decoded output is printed.
    """
    SFTP_Name = dag_config['SFTP_Polling_Sites'][0]['SFTP_Name']
    SFTP_Connection_Name = dag_config['SFTP_Polling_Sites'][0]['SFTP_Connection_Name']
    SFTP_Destination_Path = dag_config['SFTP_Polling_Sites'][0]['Feed_Groups'][0]['Feed_Group_Location']

    # Random suffix; the range makes collisions unlikely but not impossible.
    fileName = os.path.join(SFTP_Destination_Path, 'testfile_{}.txt'.format(randint(0, 9999999)))
    createFileCommand = "echo 'Hello World!' > {}".format(fileName)
    gpgCommand = "gpg --output {}.gpg -e -r [email protected] {}".format(fileName, fileName)

    sftpHook = SFTPHook(ftp_conn_id = SFTP_Connection_Name)

    for template, value in (
        ('SFTP_Name: {}', SFTP_Name),
        ('SFTP_Connection: {}', SFTP_Connection_Name),
        ('SFTP_Destination_Path: {}', SFTP_Destination_Path),
        ('Random Filename: {}', fileName),
        ('GPG Command: {}', gpgCommand),
    ):
        print(template.format(value))

    conn = sftpHook.get_conn()

    # ``execute`` output is decoded from bytes line by line before printing.
    createOutput = conn.execute(createFileCommand)
    print('Create File Results: {}'.format([chunk.decode('utf-8') for chunk in createOutput]))

    gpgOutput = conn.execute(gpgCommand)
    print('GPG Results: {}'.format([chunk.decode('utf-8') for chunk in gpgOutput]))
def pollForFiles(**kwargs):
    """Tag files that have stopped changing by appending ``.ready`` to their name.

    Reads ``SFTP_Connection_Name`` and ``Feed_Groups`` from ``kwargs``. Each
    feed group supplies ``Feed_Group_Location``, ``Feed_Group_Regex`` and
    ``Feed_Group_Name``. Matching files are recorded, the function sleeps for
    ``SLEEP_TIME``, and every file whose ``size`` and ``modify`` attributes
    are unchanged is renamed with a ``.ready`` suffix.

    Returns:
        ``0`` when no file matched in any feed group; otherwise ``None``.
    """
    # Create some local scope variables for use later in proc
    sftpConnName = kwargs['SFTP_Connection_Name']
    feedGroups = kwargs['Feed_Groups']

    # Connect to SFTP site using provided credentials - should be saved in Connections
    sourceHook = SFTPHook(ftp_conn_id = sftpConnName)

    # Full remote path -> attributes seen on the first pass.
    fileMatches = {}

    # Loop through feed locations and their regex for this SFTP site.
    for group in feedGroups:
        fullPath = group['Feed_Group_Location']
        filePattern = group['Feed_Group_Regex']
        feedGroupName = group['Feed_Group_Name']
        logging.info('Evaluating Feed Group {}'.format(feedGroupName))
        try:
            directory = sourceHook.describe_directory(path = fullPath)
            for name, attrs in directory.items():
                if re.match(filePattern, name):
                    fileMatches[os.path.join(fullPath, name)] = attrs
        except Exception as e:
            logging.error('Error attempting to poll feed group {} in directory {}'.format(feedGroupName, fullPath))
            raise e

    # If we do not find a file that matches a file mask in any of the directories, exit.
    if not fileMatches:
        return 0

    # If no trigger files or renaming is utilized by the client when placing files on SFTP, we
    # have to resort to polling for files, waiting for a time period and then comparing the size/modified time
    # to see if they are ready to pull down.
    time.sleep(SLEEP_TIME)

    for group in feedGroups:
        fullPath = group['Feed_Group_Location']
        filePattern = group['Feed_Group_Regex']
        feedGroupName = group['Feed_Group_Name']
        logging.info('Evaluating Feed Group {} after sleeping'.format(feedGroupName))
        try:
            newDirResults = sourceHook.describe_directory(fullPath)
            for name in newDirResults:
                fullFilePath = os.path.join(fullPath, name)
                if fullFilePath not in fileMatches:
                    continue
                earlier = fileMatches[fullFilePath]
                current = newDirResults[name]
                if current['size'] != earlier['size'] or current['modify'] != earlier['modify']:
                    continue
                # If file hasn't changed size or modified time since first look, set to ready
                # for another process to pick up and transfer.
                sourceHook.conn.rename(fullFilePath, fullFilePath + '.ready')
                logging.info('Tagged the {} file as ready.'.format(fullFilePath))
        except Exception as e:
            logging.error('Error attempting to rename files in feed group {} in directory {}'.format(feedGroupName, fullPath))
            raise e
def execute(self, context):
    """Record how many chunks the remote file splits into.

    The size of ``self.file_path`` is read over SFTP, divided by
    ``self.chunk_size`` and rounded up. The result is stored under
    ``self.chunks_variable_name`` inside the dictionary held by the
    ``self.master_variable`` Variable.

    :param context: Airflow task context (not used here).
    """
    conn = SFTPHook(ftp_conn_id=self.conn_id)
    my_conn = conn.get_conn()
    total_size = my_conn.lstat(self.file_path).st_size

    # Variable values are stored as text. Without JSON (de)serialization the
    # value comes back as a ``str`` and the item assignment below raises
    # TypeError, so round-trip the dictionary through JSON explicitly.
    # NOTE(review): assumes ``Variable`` is Airflow's model and that the
    # stored value is valid JSON -- confirm.
    master_variable_dict = Variable.get(self.master_variable, deserialize_json=True)
    master_variable_dict[self.chunks_variable_name] = math.ceil(total_size / self.chunk_size)
    Variable.set(self.master_variable, master_variable_dict, serialize_json=True)

    # NOTE(review): the reason for this pause is not visible from this
    # block; kept as in the original.
    time.sleep(5)
def execute(self, context):
    """Rename ``self.source_file`` to ``self.dest_file`` on the SFTP server.

    Best effort: any ``IOError`` from the rename is logged as a missing file
    and otherwise ignored.

    :param context: Airflow task context (not used here).
    """
    log = self.log
    log.info("Going to start Rename SFTP Operator")

    sftp_hook = SFTPHook(ftp_conn_id=self.sftp_conn_id)
    # NOTE(review): this sets the hook's ``no_host_key_check`` flag, which by
    # its name turns off host key checking -- confirm that is intended.
    sftp_hook.no_host_key_check = True
    conn = sftp_hook.get_conn()

    try:
        conn.rename(self.source_file, self.dest_file)
    except IOError:
        # NOTE(review): every IOError is reported as "not found", whatever
        # the actual cause was.
        log.info("File not found, skipping")

    log.info("Finished executing RenameSFTPOperator")
def __init__(self, filepath, filepattern, sftp_conn_id='sftp_default', *args, **kwargs):
    """Store the directory and pattern to watch and create the SFTP hook.

    :param filepath: Remote directory to monitor.
    :param filepattern: Pattern that file names are matched against.
    :param sftp_conn_id: Connection id handed to ``SFTPHook``.
    """
    super(SFTPSensor, self).__init__(*args, **kwargs)
    self.filepath, self.filepattern = filepath, filepattern
    self.hook = SFTPHook(sftp_conn_id)
def archive_files_in_sftp(**context):
    """Move already-transferred files into ``archive/<most_recent_date>/`` on the SFTP server.

    The file list and the date are pulled from XCom (task
    ``get_list_of_alma_sftp_files_to_transer``). The ``archive`` directory
    and its dated sub-directory are created when missing.

    :param context: Airflow task context; ``context['task_instance']`` is
        used for the XCom pulls.
    :return: Number of files moved.
    """
    sftp_conn = SFTPHook(ftp_conn_id=ALMA_SFTP_CONNECTION_ID)
    # Paramiko is the underlying package used for SSH/SFTP conns
    # the paramiko client exposes a lot more core SFTP functionality
    paramiko_conn = sftp_conn.get_conn()

    task_instance = context['task_instance']
    most_recent_date = task_instance.xcom_pull(
        task_ids='get_list_of_alma_sftp_files_to_transer',
        key='most_recent_date')
    list_of_files = task_instance.xcom_pull(
        task_ids='get_list_of_alma_sftp_files_to_transer')

    archive_path = "archive"

    # Two independent checks. The previous if/elif created only ``archive``
    # on the first run and skipped the dated sub-directory, so the renames
    # below had no target directory.
    if archive_path not in sftp_conn.list_directory("./"):
        sftp_conn.create_directory(path=f"./{archive_path}")
    if str(most_recent_date) not in sftp_conn.list_directory(f"./{archive_path}"):
        sftp_conn.create_directory(f"./{archive_path}/{most_recent_date}")

    count = 0
    # The XCom pull yields None when nothing was pushed; treat that as
    # "nothing to archive" instead of failing on iteration.
    for filename in list_of_files or []:
        logging.info(
            f"Moving {filename} to {archive_path}/{most_recent_date}/{filename}"
        )
        paramiko_conn.rename(f"{filename}",
                             f"{archive_path}/{most_recent_date}/{filename}")
        count += 1
    return count
def execute(self, context):
    """Download every file in ``self.sftp_path`` and upload them to S3 in parallel.

    Each file is fetched into its own temporary file, with up to five
    attempts and a fresh SFTP client after each failure. Once all downloads
    succeed, the files are uploaded to ``self.dest_bucket`` under
    ``self.dest_path`` through a pool of ``self.workers`` workers.

    :param context: Airflow task context (not used here).
    :return: ``False`` when the directory is empty, otherwise the list of
        remote paths that were processed.
    """
    self.log.info("Going to start Bulk sftp to s3 operator")
    sftp_hook = SFTPHook(ftp_conn_id=self.sftp_conn_id)
    sftp_hook.no_host_key_check = True
    list_dir = sftp_hook.list_directory(self.sftp_path)
    if len(list_dir) < 1:
        self.log.info("Got no files to process. Skipping")
        return False
    self.log.info(f"Got {len(list_dir)} files to move")

    temp_files = []
    file_path_list = []
    # Every temporary file created, including ones whose download failed,
    # so that all of them can be closed (and thereby removed) in ``finally``.
    opened_temp_files = []

    ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)
    sftp_client = ssh_hook.get_conn().open_sftp()
    s3_hook = S3Hook(self.aws_conn_id)

    try:
        for file_name in list_dir:
            file_path = os.path.join(self.sftp_path, file_name)
            file_path_list.append(file_path)
            s3_key = str(os.path.join(self.dest_path, file_name))
            file_metadata = {"ftp": NamedTemporaryFile("w"), "s3_key": s3_key}
            opened_temp_files.append(file_metadata["ftp"])

            for i in range(0, 5):
                try:
                    self.log.info(f"Downloading {file_path}")
                    sftp_client.get(file_path, file_metadata["ftp"].name)
                    file_metadata["ftp"].flush()
                    temp_files.append(file_metadata)
                    break
                except Exception:
                    self.log.info(
                        f"Got no response from server, waiting for next try number {(i + 1)}"
                    )
                    if i < 4:
                        # Exponential back-off with jitter, then reconnect.
                        time.sleep(2 ** i + random.random())
                        sftp_client = (
                            SSHHook(ssh_conn_id=self.sftp_conn_id)
                            .get_conn()
                            .open_sftp()
                        )
                    else:
                        # Final attempt failed: propagate the last error.
                        raise

        self.log.info(f"Uploading to S3 with {self.workers} workers")
        with Pool(self.workers) as pool:
            pool.starmap(
                s3_hook.load_file,
                [
                    (x["ftp"].name, x["s3_key"], self.dest_bucket, True, False)
                    for x in temp_files
                ],
            )
    finally:
        # Previously these handles were never closed, leaving removal of the
        # temporary files to garbage collection.
        for tmp in opened_temp_files:
            tmp.close()

    self.log.info("Finished executing Bulk sftp to s3 operator")
    return file_path_list
def poke(self, context):
    """Probe ``self.path`` on the SFTP server with a freshly created hook.

    :param context: Airflow task context (not used here).
    :return: ``True`` when the path exists, ``False`` when the server
        reports "no such file".
    :raises IOError: for any other SFTP error.
    """
    # The hook is created on every poke rather than once in __init__.
    self.hook = SFTPHook(self.sftp_conn_id)
    self.log.info('Poking for %s', self.path)
    try:
        # Only used as an existence probe; the returned value is ignored.
        self.hook.get_mod_time(self.path)
    except IOError as e:
        if e.errno == SFTP_NO_SUCH_FILE:
            return False
        raise e
    self.hook.close_conn()
    return True
def __init__(self, filepath, filepattern, sftp_conn_id='sftp_default', *args, **kwargs):
    """Store the sensor settings and create an SFTP hook with a 10 s keepalive.

    :param filepath: Remote directory to monitor.
    :param filepattern: Pattern that file names are matched against.
    :param sftp_conn_id: Connection id handed to ``SFTPHook``.
    """
    super(SFTPSensor, self).__init__(*args, **kwargs)
    self.filepath, self.filepattern = filepath, filepattern
    self.sftp_conn_id = sftp_conn_id
    self.hook = SFTPHook(
        ftp_conn_id=sftp_conn_id,
        keepalive_interval=10,
    )
def removeFileFromSFTP(**kwargs):
    """
    Delete the file named in the triggering ``dag_run`` configuration from SFTP.

    Reads ``SFTP_Connection_Name`` and ``File_Name`` from
    ``kwargs['dag_run'].conf``.
    """
    run_conf = kwargs['dag_run'].conf
    sftpConn = run_conf['SFTP_Connection_Name']
    fileName = run_conf['File_Name']

    sftpHook = SFTPHook(ftp_conn_id=sftpConn)

    logging.info('Attempting to delete {} from {}'.format(fileName, sftpConn))
    sftpHook.delete_file(fileName)
    logging.info('Deletion Successful')
def pollForFiles(**kwargs):
    """
    Poll the flatfiles directory for files to process.

    Every file in the ``auto`` directory whose name matches ``fileRegex`` is
    moved to the ``processing`` directory, and a ``ProcessStoreFeed`` DAG
    run is triggered for it. The SFTP connection is closed on exit, whether
    or not an error occurred.
    """
    fileRegex = r'^BP_STORE.*\.txt$'
    autoPath = '/airflow_test/postgresdb_etl_poc_flatfiles/auto'
    processPath = '/airflow_test/postgresdb_etl_poc_flatfiles/processing'

    sourceHook = SFTPHook(ftp_conn_id='kub2VM')
    fileMatches = {}

    try:
        try:
            directory = sourceHook.describe_directory(path=autoPath)
            for file, attrs in directory.items():
                if re.match(fileRegex, file):
                    fileMatches[file] = attrs
        except Exception:
            logging.error('Error attempting to poll directory {}'.format(autoPath))
            raise

        print('FileMatches: {}'.format(fileMatches))

        for file in fileMatches:
            sourceHook.conn.rename(os.path.join(autoPath, file),
                                   os.path.join(processPath, file))

            triggerConfig = {
                'fileName': file,
                'processPath': processPath,
            }

            trigger_dag(dag_id='ProcessStoreFeed',
                        run_id='trig_{}'.format(timezone.utcnow().isoformat()),
                        conf=json.dumps(triggerConfig),
                        execution_date=None,
                        replace_microseconds=False)

            logging.info('Triggered DAG Job for File: {}'.format(file))

            # Introduce a delay between scheduling dags so there is an order to execution.
            # I'm worried that if we submit sub-second for multiples that it'll try to run them all at once.
            time.sleep(10)
    finally:
        # Close the connection on failure too; previously an exception in
        # the rename or trigger step left it open.
        if sourceHook.conn:
            sourceHook.close_conn()
def check_for_file_py(**kwargs):
    """Report whether ``FILENAME`` is present in the ``FILEPATH`` directory on SFTP.

    Reads ``path``, ``sftp_conn_id`` and ``templates_dict['filename']`` from
    ``kwargs``.

    NOTE(review): ``path`` and ``filename`` are read but the check itself
    uses the module-level ``FILEPATH`` / ``FILENAME`` -- confirm which pair
    is intended.

    Returns:
        ``True`` if the file name appears in the directory listing,
        otherwise ``False``.
    """
    path = kwargs.get('path', None)
    logging.info('path type: {} || path value: {}'.format(type(path), path))

    sftp_conn_id = kwargs.get('sftp_conn_id', None)
    # Raises AttributeError when ``templates_dict`` is absent, as before.
    filename = kwargs.get('templates_dict').get('filename', None)

    sftp_hook = SFTPHook(ftp_conn_id=sftp_conn_id)
    logging.info('sftp_hook type: {} || sftp_hook value: {}'.format(
        type(sftp_hook), sftp_hook))

    # The connection is requested before listing; its return value is unused.
    sftp_client = sftp_hook.get_conn()

    fileList = sftp_hook.list_directory(FILEPATH)
    logging.info('FileList: {}'.format(fileList))

    return FILENAME in fileList
def sftp_to_pg(**kwargs):
    """Load today's PNC escrow file from SFTP into ``escrow.escrow``.

    The first file in ``/Home/IET/PNC`` whose name starts with
    ``tls.cityofdetroit.out.<yymmdd>`` is downloaded to ``/tmp``; the table
    is then truncated and repopulated with a ``copy ... from`` statement.

    Raises:
        IndexError: when no file for today's date is present.
    """
    today = datetime.date.today().strftime('%y%m%d')

    conn = SFTPHook('sftp_cityftp')
    files = conn.describe_directory('/Home/IET/PNC')

    candidates = []
    for fn in files.keys():
        if fn.startswith(f"tls.cityofdetroit.out.{today}"):
            candidates.append(fn)
    # Takes the first match in listing order; fails with IndexError when
    # there is none.
    file_name = candidates[0]

    conn.retrieve_file(f"/Home/IET/PNC/{file_name}", f"/tmp/{file_name}")

    pg_conn = PostgresHook('etl_postgres')
    # NOTE(review): truncate and copy are issued as two separate ``run``
    # calls; whether they share a transaction depends on the hook.
    pg_conn.run("truncate table escrow.escrow")
    pg_conn.run(
        f"copy escrow.escrow from '/tmp/{file_name}' (FORMAT CSV, HEADER FALSE) "
    )
def execute(self, context):
    """Upload one chunk of a remote SFTP file as a part of an S3 multipart upload.

    Reads ``self.chunk_size`` bytes starting at offset
    ``self.chunk_number * self.chunk_size`` from ``self.file_source_path``
    and sends them with ``upload_part`` for ``self.upload_id``.

    :param context: Airflow task context (not used here).
    """
    conn_source = SFTPHook(ftp_conn_id=self.conn_id_source)
    my_conn_source = conn_source.get_conn()

    source_file = my_conn_source.sftp_client.file(self.file_source_path, 'r')
    try:
        source_file.seek(self.chunk_number * self.chunk_size)
        payload = source_file.read(self.chunk_size)
    finally:
        # The remote file handle was previously never closed.
        source_file.close()

    # No credentials are passed, so boto3 resolves them itself.
    client = boto3.client('s3')
    # NOTE(review): ``chunk_number`` is used both as a zero-based offset
    # multiplier and as the S3 part number -- confirm the numbering callers
    # use is valid for both.
    client.upload_part(Body=payload,
                       Bucket=self.bucket,
                       Key=self.key,
                       PartNumber=self.chunk_number,
                       UploadId=self.upload_id)
def execute(self, context):
    """Copy one object, or every object matching a single-wildcard name, from GCS to SFTP.

    Each object is written to ``self.destination_path`` joined with the
    object name.

    :param context: Airflow task context (not used here).
    :raises AirflowException: if ``self.source_object`` contains more than
        one wildcard.
    """
    gcs_hook = GoogleCloudStorageHook(gcp_conn_id=self.gcp_conn_id,
                                      delegate_to=self.delegate_to)
    sftp_hook = SFTPHook(self.sftp_conn_id)

    if WILDCARD not in self.source_object:
        # Single-object case.
        destination_path = os.path.join(self.destination_path, self.source_object)
        self._copy_single_object(gcs_hook, sftp_hook, self.source_object,
                                 destination_path)
        self.log.info("Done. Uploaded '%s' file to %s", self.source_object,
                      destination_path)
        return

    total_wildcards = self.source_object.count(WILDCARD)
    if total_wildcards > 1:
        raise AirflowException(
            "Only one wildcard '*' is allowed in source_object parameter. "
            "Found {} in {}.".format(total_wildcards, self.source_object))

    # Text before the wildcard becomes the listing prefix, text after it the
    # delimiter.
    prefix, delimiter = self.source_object.split(WILDCARD, 1)
    objects = gcs_hook.list(self.source_bucket, prefix=prefix, delimiter=delimiter)

    for source_object in objects:
        self._copy_single_object(
            gcs_hook, sftp_hook, source_object,
            os.path.join(self.destination_path, source_object))

    self.log.info("Done. Uploaded '%d' files to %s", len(objects),
                  self.destination_path)
class SFTPSensor(BaseSensorOperator):
    """Sensor that succeeds when a remote directory holds files matching a regex.

    :param filepath: Remote directory to list.
    :param filepattern: Regular expression matched (``re.match``) against
        each file name.
    :param sftp_conn_id: Connection id handed to ``SFTPHook``.
    """

    @apply_defaults
    def __init__(self, filepath, filepattern, sftp_conn_id='sftp_default', *args, **kwargs):
        super(SFTPSensor, self).__init__(*args, **kwargs)
        self.filepath, self.filepattern = filepath, filepattern
        self.hook = SFTPHook(sftp_conn_id)

    def poke(self, context):
        """List the directory, push all matching names to XCom, report whether any matched.

        The list of matches is pushed under the key ``file_name`` even when
        it is empty. A missing directory yields ``False``; any other
        ``IOError`` is re-raised.
        """
        full_path = self.filepath
        file_pattern = re.compile(self.filepattern)
        matches = []
        try:
            found = False
            for entry in self.hook.list_directory(full_path):
                if re.match(file_pattern, entry):
                    matches.append(entry)
                    print('I found the file! {}'.format(entry))
                    found = True
                else:
                    # Non-matching names are logged along with the pattern.
                    self.log.info(entry)
                    self.log.info(file_pattern)
            context["task_instance"].xcom_push("file_name", matches)
            return found
        except IOError as e:
            if e.errno == SFTP_NO_SUCH_FILE:
                return False
            raise e
def execute(self, context):
    """Search a remote directory over SFTP/FTPS/FTP and download the matching files.

    The date window defaults to 1900-01-01 .. 2999-12-31 when
    ``self.min_date`` / ``self.max_date`` are not set. The hook class is
    chosen from ``self.ftp_conn_type`` (``'sftp'``, ``'ftps'``, anything
    else means plain FTP).

    :param context: Airflow task context (not used here).
    :return: ``self.downloaded_files``.
    """
    min_date = FTPSearchOperator._get_date_param(self.min_date or '1900-01-01')
    max_date = FTPSearchOperator._get_date_param(self.max_date or '2999-12-31')

    self.log.info("Using ftp connection: %s", self.ftp_conn_id)
    self.log.info("min date: %s ", min_date.to_datetime_string())
    self.log.info("max date: %s ", max_date.to_datetime_string())

    conn_type = self.ftp_conn_type
    if conn_type == 'sftp':
        hook_class = SFTPHook
    elif conn_type == 'ftps':
        hook_class = FTPSHook
    else:
        hook_class = FTPHook
    self.ftp_hook = hook_class(ftp_conn_id=self.ftp_conn_id)

    self.log.info("ftp connection info: %s ",
                  self.ftp_hook.get_connection(self.ftp_conn_id).port)
    self.log.info("Getting directory listing for %s", self.remote_filepath)

    file_list = self.get_file_list(
        ftp_hook=self.ftp_hook,
        remote_filepath=self.remote_filepath,
        search_expr=self.search_expr,
        min_date=min_date,
        max_date=max_date)

    # Only ``None`` means "nothing found"; an empty list still goes to
    # ``download_files``.
    if file_list is None:
        self.log.info("No files found matching filters in %s", self.remote_filepath)
    else:
        self.download_files(self.ftp_hook, file_list)

    return self.downloaded_files
def execute(self, context):
    """Copy files from an SFTP folder to S3, optionally filtered by a predicate.

    Each file in ``self.ftp_folder`` is downloaded into
    ``self.tmp_directory`` and uploaded to ``self.s3_bucket`` under
    ``self.s3_key``. Keys that already exist are skipped unless
    ``self.replace`` is truthy.

    :param context: Airflow task context (not used here).
    """
    sftp_hook = SFTPHook(ftp_conn_id=self.sftp_conn_id)
    s3_hook = S3Hook(s3_conn_id=self.s3_conn_id)

    sftp_hook.get_conn()
    file_list = sftp_hook.list_directory(self.ftp_folder)
    if self.filter:
        # ``filter`` returns a new iterable; the previous code discarded it,
        # so the predicate never had any effect.
        file_list = list(filter(self.filter, file_list))

    # create tmp directory
    if not os.path.exists(self.tmp_directory):
        os.makedirs(self.tmp_directory)

    for file_name in file_list:
        s3_key_file = self.s3_key + "/" + str(file_name)
        exists = s3_hook.check_for_key(s3_key_file, self.s3_bucket)
        if exists and not self.replace:
            continue

        ftp_file_fullpath = self.ftp_folder + "/" + str(file_name)
        local_file_fullpath = self.tmp_directory + "/" + str(file_name)

        logging.info("Dowloading file [%s] from sftp to local [%s]",
                     ftp_file_fullpath, local_file_fullpath)
        sftp_hook.get_file(ftp_file_fullpath, local_file_fullpath)
        logging.info("Done.")

        logging.info("Uploading file [%s] to S3 on bucket [%s] and key [%s]",
                     local_file_fullpath, self.s3_bucket, s3_key_file)
        s3_hook.load_file(local_file_fullpath, s3_key_file, self.s3_bucket, self.replace)
        logging.info("Done.")
class SFTPHookTest(unittest.TestCase):
    # Exercises SFTPHook's connection handling and its directory/file
    # operations against paths under TMP_PATH.

    def setUp(self):
        configuration.load_test_config()
        self.hook = SFTPHook()
        os.makedirs(os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS))
        fixture_path = os.path.join(TMP_PATH, TMP_FILE_FOR_TESTS)
        with open(fixture_path, 'a') as fixture:
            fixture.write('Test file')

    def test_get_conn(self):
        self.assertEqual(type(self.hook.get_conn()), pysftp.Connection)

    def test_close_conn(self):
        self.hook.conn = self.hook.get_conn()
        self.assertIsNotNone(self.hook.conn)
        self.hook.close_conn()
        self.assertIsNone(self.hook.conn)

    def test_describe_directory(self):
        described = self.hook.describe_directory(TMP_PATH)
        self.assertIn(TMP_DIR_FOR_TESTS, described)

    def test_list_directory(self):
        # The scratch directory starts out empty.
        scratch_dir = os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS)
        self.assertEqual(self.hook.list_directory(path=scratch_dir), [])

    def test_create_and_delete_directory(self):
        new_dir_name = 'new_dir'
        scratch_dir = os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS)
        new_dir = os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS, new_dir_name)

        self.hook.create_directory(new_dir)
        self.assertIn(new_dir_name, self.hook.describe_directory(scratch_dir))

        self.hook.delete_directory(new_dir)
        self.assertNotIn(new_dir_name, self.hook.describe_directory(scratch_dir))

    def test_store_retrieve_and_delete_file(self):
        scratch_dir = os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS)
        remote_file = os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS, TMP_FILE_FOR_TESTS)
        retrieved_file_name = 'retrieved.txt'
        retrieved_path = os.path.join(TMP_PATH, retrieved_file_name)

        # Store, then confirm it is the only entry in the scratch directory.
        self.hook.store_file(
            remote_full_path=remote_file,
            local_full_path=os.path.join(TMP_PATH, TMP_FILE_FOR_TESTS)
        )
        self.assertEqual(self.hook.list_directory(path=scratch_dir),
                         [TMP_FILE_FOR_TESTS])

        # Retrieve under a new local name and clean that copy up again.
        self.hook.retrieve_file(
            remote_full_path=remote_file,
            local_full_path=retrieved_path
        )
        self.assertIn(retrieved_file_name, os.listdir(TMP_PATH))
        os.remove(retrieved_path)

        # Delete and confirm the scratch directory is empty again.
        self.hook.delete_file(path=remote_file)
        self.assertEqual(self.hook.list_directory(path=scratch_dir), [])

    def test_get_mod_time(self):
        remote_file = os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS, TMP_FILE_FOR_TESTS)
        self.hook.store_file(
            remote_full_path=remote_file,
            local_full_path=os.path.join(TMP_PATH, TMP_FILE_FOR_TESTS)
        )
        mod_time = self.hook.get_mod_time(path=remote_file)
        # The hook is expected to return a 14-character value.
        self.assertEqual(len(mod_time), 14)

    def tearDown(self):
        shutil.rmtree(os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS))
        os.remove(os.path.join(TMP_PATH, TMP_FILE_FOR_TESTS))
def setUp(self):
    """Load the test configuration and create a hook, a scratch directory and a fixture file."""
    configuration.load_test_config()
    self.hook = SFTPHook()

    scratch_dir = os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS)
    os.makedirs(scratch_dir)

    fixture_path = os.path.join(TMP_PATH, TMP_FILE_FOR_TESTS)
    with open(fixture_path, 'a') as fixture:
        fixture.write('Test file')
def __init__(self, path, sftp_conn_id='sftp_default', *args, **kwargs):
    """Store the remote path to watch and create the SFTP hook.

    :param path: Remote file or directory path.
    :param sftp_conn_id: Connection id handed to ``SFTPHook``.
    """
    super().__init__(*args, **kwargs)
    self.hook = SFTPHook(sftp_conn_id)
    self.path = path
class SFTPHookTest(unittest.TestCase):
    # Exercises SFTPHook's connection handling, its directory/file
    # operations against paths under TMP_PATH, and how the connection
    # "extra" field maps onto ``no_host_key_check``.

    def setUp(self):
        configuration.load_test_config()
        self.hook = SFTPHook()
        os.makedirs(os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS))
        fixture_path = os.path.join(TMP_PATH, TMP_FILE_FOR_TESTS)
        with open(fixture_path, 'a') as fixture:
            fixture.write('Test file')

    @staticmethod
    def _hook_for_connection(get_connection, **connection_kwargs):
        # Make the patched ``get_connection`` return a Connection built with
        # the given extra arguments, then construct a hook on top of it.
        get_connection.return_value = Connection(
            login='******', host='host', **connection_kwargs)
        return SFTPHook()

    def test_get_conn(self):
        self.assertEqual(type(self.hook.get_conn()), pysftp.Connection)

    def test_close_conn(self):
        self.hook.conn = self.hook.get_conn()
        self.assertIsNotNone(self.hook.conn)
        self.hook.close_conn()
        self.assertIsNone(self.hook.conn)

    def test_describe_directory(self):
        described = self.hook.describe_directory(TMP_PATH)
        self.assertIn(TMP_DIR_FOR_TESTS, described)

    def test_list_directory(self):
        # The scratch directory starts out empty.
        scratch_dir = os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS)
        self.assertEqual(self.hook.list_directory(path=scratch_dir), [])

    def test_create_and_delete_directory(self):
        new_dir_name = 'new_dir'
        scratch_dir = os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS)
        new_dir = os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS, new_dir_name)

        self.hook.create_directory(new_dir)
        self.assertIn(new_dir_name, self.hook.describe_directory(scratch_dir))

        self.hook.delete_directory(new_dir)
        self.assertNotIn(new_dir_name, self.hook.describe_directory(scratch_dir))

    def test_store_retrieve_and_delete_file(self):
        scratch_dir = os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS)
        remote_file = os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS, TMP_FILE_FOR_TESTS)
        retrieved_file_name = 'retrieved.txt'
        retrieved_path = os.path.join(TMP_PATH, retrieved_file_name)

        # Store, then confirm it is the only entry in the scratch directory.
        self.hook.store_file(
            remote_full_path=remote_file,
            local_full_path=os.path.join(TMP_PATH, TMP_FILE_FOR_TESTS)
        )
        self.assertEqual(self.hook.list_directory(path=scratch_dir),
                         [TMP_FILE_FOR_TESTS])

        # Retrieve under a new local name and clean that copy up again.
        self.hook.retrieve_file(
            remote_full_path=remote_file,
            local_full_path=retrieved_path
        )
        self.assertIn(retrieved_file_name, os.listdir(TMP_PATH))
        os.remove(retrieved_path)

        # Delete and confirm the scratch directory is empty again.
        self.hook.delete_file(path=remote_file)
        self.assertEqual(self.hook.list_directory(path=scratch_dir), [])

    def test_get_mod_time(self):
        remote_file = os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS, TMP_FILE_FOR_TESTS)
        self.hook.store_file(
            remote_full_path=remote_file,
            local_full_path=os.path.join(TMP_PATH, TMP_FILE_FOR_TESTS)
        )
        mod_time = self.hook.get_mod_time(path=remote_file)
        # The hook is expected to return a 14-character value.
        self.assertEqual(len(mod_time), 14)

    @mock.patch('airflow.contrib.hooks.sftp_hook.SFTPHook.get_connection')
    def test_no_host_key_check_default(self, get_connection):
        hook = self._hook_for_connection(get_connection)
        self.assertEqual(hook.no_host_key_check, False)

    @mock.patch('airflow.contrib.hooks.sftp_hook.SFTPHook.get_connection')
    def test_no_host_key_check_enabled(self, get_connection):
        hook = self._hook_for_connection(
            get_connection, extra='{"no_host_key_check": true}')
        self.assertEqual(hook.no_host_key_check, True)

    @mock.patch('airflow.contrib.hooks.sftp_hook.SFTPHook.get_connection')
    def test_no_host_key_check_disabled(self, get_connection):
        hook = self._hook_for_connection(
            get_connection, extra='{"no_host_key_check": false}')
        self.assertEqual(hook.no_host_key_check, False)

    @mock.patch('airflow.contrib.hooks.sftp_hook.SFTPHook.get_connection')
    def test_no_host_key_check_disabled_for_all_but_true(self, get_connection):
        # A non-boolean value must not enable the flag.
        hook = self._hook_for_connection(
            get_connection, extra='{"no_host_key_check": "foo"}')
        self.assertEqual(hook.no_host_key_check, False)

    @mock.patch('airflow.contrib.hooks.sftp_hook.SFTPHook.get_connection')
    def test_no_host_key_check_ignore(self, get_connection):
        # ``ignore_hostkey_verification`` is expected to drive the same flag.
        hook = self._hook_for_connection(
            get_connection, extra='{"ignore_hostkey_verification": true}')
        self.assertEqual(hook.no_host_key_check, True)

    @mock.patch('airflow.contrib.hooks.sftp_hook.SFTPHook.get_connection')
    def test_no_host_key_check_no_ignore(self, get_connection):
        hook = self._hook_for_connection(
            get_connection, extra='{"ignore_hostkey_verification": false}')
        self.assertEqual(hook.no_host_key_check, False)

    def tearDown(self):
        shutil.rmtree(os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS))
        os.remove(os.path.join(TMP_PATH, TMP_FILE_FOR_TESTS))