def test_conn_with_extra_parameters(self):
    """Check that ``compress`` and ``no_host_key_check`` set in the
    connection's ``extra`` JSON end up as ``True`` on the hook."""
    from airflow.contrib.hooks.ssh_hook import SSHHook

    # Register a connection whose extras carry both options.
    extra_conn = models.Connection(
        conn_id='ssh_with_extra',
        host='localhost',
        conn_type='ssh',
        extra='{"compress" : true, "no_host_key_check" : "true"}',
    )
    db.merge_conn(extra_conn)

    hook_under_test = SSHHook(ssh_conn_id='ssh_with_extra',
                              keepalive_interval=10)
    hook_under_test.get_conn()

    self.assertEqual(hook_under_test.compress, True)
    self.assertEqual(hook_under_test.no_host_key_check, True)
class SSHHookTest(unittest.TestCase):
    """Tests for ``SSHHook`` built from the ``ssh_default`` connection."""

    def setUp(self):
        """Load the test configuration and build a hook with host key
        checking disabled."""
        configuration.load_test_config()
        from airflow.contrib.hooks.ssh_hook import SSHHook

        self.hook = SSHHook(ssh_conn_id='ssh_default', keepalive_interval=10)
        self.hook.no_host_key_check = True

    def test_ssh_connection(self):
        """``get_conn`` must return something other than ``None``."""
        self.assertIsNotNone(self.hook.get_conn())

    def test_tunnel(self):
        """Start a helper server process, then read its greeting through a
        tunnel from local port 2135 to remote port 2134."""
        print("Setting up remote listener")
        import subprocess
        import socket

        listener_cmd = ["python", "-c", HELLO_SERVER_CMD]
        self.server_handle = subprocess.Popen(listener_cmd,
                                              stdout=subprocess.PIPE)

        print("Setting up tunnel")
        with self.hook.create_tunnel(2135, 2134):
            print("Tunnel up")
            # The helper process prints "ready" once it is listening.
            ready_marker = self.server_handle.stdout.read(5)
            self.assertEqual(ready_marker, b"ready")

            print("Connecting to server via tunnel")
            client_socket = socket.socket()
            client_socket.connect(("localhost", 2135))

            print("Receiving...", )
            greeting = client_socket.recv(5)
            self.assertEqual(greeting, b"hello")

            print("Closing connection")
            client_socket.close()

            print("Waiting for listener...")
            self.server_handle.communicate()
            self.assertEqual(self.server_handle.returncode, 0)
            print("Closing tunnel")
def execute(self, context):
    """
    Copy the file at ``self.sftp_path`` on the SFTP server to S3.

    The file is downloaded into a temporary file and then uploaded to
    ``self.s3_bucket`` under ``self.s3_key``, replacing any existing key.

    If an ``AirflowException`` is raised while creating the S3 hook, while
    opening the SFTP session, or during the transfer, a message is logged
    and the process exits with status 1.

    :param context: task context; not used by this method.
    """
    try:
        s3_hook = S3Hook(self.s3_conn_id)
        logging.info("Connected to S3 hook")
    except AirflowException:
        logging.info("Error in Connecting to S3 Hook")
        exit(1)

    try:
        ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)
        logging.info("Connected to SSH Hook")
        ssh_client = ssh_hook.get_conn()
        sftp_client = ssh_client.open_sftp()
        logging.info("Connecting to SFTP")
    except AirflowException:
        logging.info("Error in Connecting to SFTP")
        exit(1)

    try:
        # The temporary file only needs to live for the duration of the copy.
        with NamedTemporaryFile("w") as f:
            sftp_client.get(self.sftp_path, f.name)
            s3_hook.load_file(filename=f.name,
                              key=self.s3_key,
                              bucket_name=self.s3_bucket,
                              replace=True)
            logging.info("SUCCEEDED")
    except AirflowException as e:
        # A %s placeholder is required: passing the error as an extra
        # argument without one makes logging report a formatting error
        # instead of emitting the message.
        logging.info("Transfer to S3 FAILED: %s", e)
        exit(1)
def execute(self, context):
    """
    Delete every file in ``self.sftp_folder_path`` whose name matches the
    ``self.sftp_filename`` pattern.

    :param context: task context; not used by this method.
    :raises IOError: any SFTP error raised while deleting whose first
        argument is not the error code ``2``.
    """
    sftp_client = SSHHook(ssh_conn_id=self.sftp_conn_id).get_conn().open_sftp()

    # Get list of files in sftp_path
    self.log.info(f"Getting list of files in sftp_path: `{self.sftp_folder_path}`")
    folder_entries = sftp_client.listdir(self.sftp_folder_path)
    matching_names = fnmatch.filter(folder_entries, self.sftp_filename)

    sftp_object = None
    try:
        if not matching_names:
            self.log.info(
                f"No files found in folder that matches `{self.sftp_filename}` parameter."
            )
        for name in matching_names:
            sftp_object = os.path.join(self.sftp_folder_path, name)
            sftp_client.remove(path=sftp_object)
            self.log.info(f"Deleted file `{sftp_object}`")
    except IOError as ex:
        # The client's IOError does not carry a consistent number of
        # arguments: a missing file gives error code `2` first, a folder
        # gives only a text message, and a permissions failure gives
        # error code 13 first.
        #
        # Only the missing-file case is tolerated; everything else is
        # re-raised so the Airflow task fails.
        if ex.args[0] != 2:
            raise
        self.log.info(f"File does not exist `{sftp_object}`")
def runDAP(**kwargs):
    """
    Connects to App Server via SSH and executes script, capturing and reporting output.

    The exit status, stdout and stderr of the remote command are always
    printed, and the SSH connection is closed if it was opened.

    :param kwargs: accepted for caller compatibility; not used.
    :raises Exception: with the remote stderr text when the command wrote
        anything to stderr.
    """
    sshSource = SSHHook(ssh_conn_id='DAP_App_Server')
    command = 'E:\\Airflow_Test\\DAP\\DAPConsoleProcessor.exe -config "E:\\Airflow_Test\\DAP\\Configuration\\DAPOrderCancellation.xml" -jobname "DAPOrderCancellation.xml"'

    # Pre-bind everything the ``finally`` block reads, so that a failure in
    # get_conn()/exec_command() surfaces as itself, not as a NameError.
    sshConn = None
    exitStatus = None
    stdOutput = None
    errorMessage = None
    try:
        sshConn = sshSource.get_conn()
        stdIn, stdOut, stdErr = sshConn.exec_command(command=command)
        # Blocks until the remote command finishes.
        exitStatus = stdOut.channel.recv_exit_status()
        errorMessage = stdErr.read().decode('ascii')
        stdOutput = stdOut.read().decode('ascii')
        if exitStatus == 0:
            print('DAP Started Successfully.')
        if errorMessage:
            logging.error('DAP Processor Failure. See Exception')
            raise Exception(errorMessage)
    finally:
        print('Exit Status: {}'.format(exitStatus))
        print('StdOut: {}'.format(stdOutput))
        print('StdErr: {}'.format(errorMessage))
        if sshConn:
            sshConn.close()
def execute(self, context):
    """
    Copy files from the ADLS folder to ``self.sftp_folder_path`` over SFTP.

    Only ADLS files whose base name matches ``self.source_object`` are
    considered. Unless ``self.reload_all`` is set, files whose name already
    exists in the SFTP folder are left out. Each file goes through a
    temporary local directory.

    :param context: task context; not used by this method.
    :raises IOError: re-raised when listing ``self.sftp_folder_path`` fails.
    """
    if not self._adls_hook:
        self._adls_hook = ADLSGen2Hook(
            container=self.adls_container,
            azure_data_lake_conn_id=self.azure_data_lake_conn_id,
        )

    sftp_client = SSHHook(ssh_conn_id=self.sftp_conn_id).get_conn().open_sftp()

    # Collect the base names of the ADLS files that match the pattern.
    source_files = []
    for adls_path in self._get_adls_files():
        _, base_name = os.path.split(adls_path)
        if fnmatch.fnmatch(base_name, self.source_object):
            source_files.append(base_name)
    self.log.info(f"Source Files: `{source_files}`")

    # List what is already present in the SFTP folder.
    try:
        self.log.info(
            f"Getting list of files in sftp_path: `{self.sftp_folder_path}`"
        )
        sftp_files = sftp_client.listdir(self.sftp_folder_path)
    except IOError as listing_error:
        self.log.error(
            f"The folder `{self.sftp_folder_path}` does not exist on the sftp server."
        )
        raise listing_error

    # With reload_all every matching ADLS file is processed; otherwise only
    # files whose name is not yet in the SFTP folder.
    if not self.reload_all:
        self.log.info(f"Existing files in sftp folder: `{sftp_files}`")
        files_to_process = set(source_files) - set(sftp_files)
    else:
        files_to_process = source_files
    self.log.info(f"Files to process: `{files_to_process}`")

    # Stage each file in a temporary folder, then push it to SFTP.
    with tempfile.TemporaryDirectory() as temp_folder:
        for name in files_to_process:
            temp_path = os.path.join(temp_folder, name)
            adls_object = os.path.join(self.adls_folder_path, name)
            sftp_object = os.path.join(self.sftp_folder_path, name)

            self.log.info(f"Processing file: `{adls_object}`")
            self._adls_hook.download_file(
                local_path=temp_path, remote_path=adls_object, overwrite=True
            )
            sftp_client.put(localpath=temp_path, remotepath=sftp_object)
            os.remove(temp_path)

    # Close ADLS Connection
    self._adls_hook.connection.close()
def check_for_file_py(**kwargs):
    """
    Print and log the name of every entry in a remote SFTP directory.

    :param kwargs: ``path`` is the remote directory to list and
        ``sftp_conn_id`` is the connection id handed to ``SSHHook``; both
        default to ``None`` when missing.
    """
    remote_dir = kwargs.get('path')
    conn_id = kwargs.get('sftp_conn_id')

    sftp_client = SSHHook(ssh_conn_id=conn_id).get_conn().open_sftp()

    for entry in sftp_client.listdir(remote_dir):
        print(entry)
        logging.info('Filename: ' + str(entry))
def get_crawler_report() -> str:
    """
    Get crawler report.

    Runs ``cat /bigcrawler-scrapy/summary.txt`` inside the container whose
    name matches ``bigscrapy_projects_airflow`` over the ``ssh_big_airflow``
    connection, prints the output and returns it.

    :return: the remote command's stdout.
    """
    ssh = SSHHook(ssh_conn_id='ssh_big_airflow')
    # Use the client as a context manager so the SSH connection is closed
    # once the output has been read, instead of being left open.
    with ssh.get_conn() as client:
        stdin, stdout, stderr = client.exec_command("""
            docker exec `docker ps --filter name=bigscrapy_projects_airflow -q` \
            sh -c 'cat /bigcrawler-scrapy/summary.txt'
        """)
        message = "".join(stdout.readlines())
    print(f'crawler_report: {message}')
    return message
def execute(self, context):
    """
    Download ``self.s3_key`` from ``self.s3_bucket`` into a temporary file
    and upload that file to ``self.sftp_path`` over SFTP.

    ``self.s3_key`` is first replaced by the result of ``get_s3_key``.

    :param context: task context; not used by this method.
    """
    self.s3_key = self.get_s3_key(self.s3_key)

    sftp_side = SSHHook(ssh_conn_id=self.sftp_conn_id)
    s3_side = S3Hook(self.s3_conn_id)

    s3_client = s3_side.get_conn()
    ssh_client = sftp_side.get_conn()
    sftp_client = ssh_client.open_sftp()

    # The temporary file is removed when the block exits.
    with NamedTemporaryFile("w") as staging_file:
        local_name = staging_file.name
        s3_client.download_file(self.s3_bucket, self.s3_key, local_name)
        sftp_client.put(local_name, self.sftp_path)
def execute(self, context):
    """
    Download every entry of ``self.sftp_path`` and upload them all to S3.

    Each file is fetched into its own temporary file (up to five attempts,
    reconnecting between attempts). Once every download has finished, the
    files are uploaded to ``self.dest_bucket`` through a pool of
    ``self.workers`` workers.

    :param context: task context; not used by this method.
    :return: ``False`` when the directory listing is empty, otherwise the
        list of remote paths that were processed.
    """
    self.log.info("Going to start Bulk sftp to s3 operator")
    sftp_hook = SFTPHook(ftp_conn_id=self.sftp_conn_id)
    sftp_hook.no_host_key_check = True
    list_dir = sftp_hook.list_directory(self.sftp_path)
    if len(list_dir) < 1:
        self.log.info("Got no files to process. Skipping")
        return False
    self.log.info(f"Got {len(list_dir)} files to move")
    # Holds the open temporary files; they are still referenced here when
    # their names are used for the upload below.
    temp_files = []
    file_path_list = []
    # A separate SSH-based SFTP client is used for the downloads; the
    # SFTPHook above is only used for the directory listing.
    ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)
    sftp_client = ssh_hook.get_conn().open_sftp()
    s3_hook = S3Hook(self.aws_conn_id)
    for file_name in list_dir:
        file_path = os.path.join(self.sftp_path, file_name)
        file_path_list.append(file_path)
        s3_key = str(os.path.join(self.dest_path, file_name))
        file_metadata = {"ftp": NamedTemporaryFile("w"), "s3_key": s3_key}
        # Up to five attempts per file.
        for i in range(0, 5):
            try:
                self.log.info(f"Downloading {file_path}")
                sftp_client.get(file_path, file_metadata["ftp"].name)
                file_metadata["ftp"].flush()
                temp_files.append(file_metadata)
                break
            except Exception:
                self.log.info(
                    f"Got no response from server, waiting for next try number {(i + 1)}"
                )
                if i < 4:
                    # Exponential backoff with jitter, then a fresh
                    # connection for the next attempt.
                    time.sleep(2 ** i + random.random())
                    sftp_client = (
                        SSHHook(ssh_conn_id=self.sftp_conn_id)
                        .get_conn()
                        .open_sftp()
                    )
                else:
                    # Last attempt failed: let the error fail the task.
                    raise
    self.log.info(f"Uploading to S3 with {self.workers} workers")
    with Pool(self.workers) as pool:
        # NOTE(review): the trailing positional True/False presumably map to
        # load_file's `replace` and `encrypt` parameters — confirm against
        # the S3Hook in use.
        pool.starmap(
            s3_hook.load_file,
            [
                (x["ftp"].name, x["s3_key"], self.dest_bucket, True, False)
                for x in temp_files
            ],
        )
    self.log.info("Finished executing Bulk sftp to s3 operator")
    return file_path_list
def execute(self, context):
    """
    Copy files from ``self.sftp_folder_path`` on the SFTP server to ADLS.

    Only remote entries matching ``self.sftp_filename`` are considered.
    Unless ``self.reload_all`` is set, files whose name already exists in
    the ADLS folder are left out. An entry whose transfer raises
    ``IOError`` is logged as a skipped directory.

    :param context: task context; not used by this method.
    """
    if not self._adls_hook:
        self._adls_hook = ADLSGen2Hook(
            container=self.adls_container,
            azure_data_lake_conn_id=self.azure_data_lake_conn_id,
        )

    sftp_client = SSHHook(ssh_conn_id=self.sftp_conn_id).get_conn().open_sftp()

    # Get list of files in sftp_path
    self.log.info(f"Getting list of files in sftp_path: `{self.sftp_folder_path}`")
    remote_entries = sftp_client.listdir(self.sftp_folder_path)
    matching_names = fnmatch.filter(remote_entries, self.sftp_filename)

    # Get files that already exist in the ADLS folder
    existing_files = self._get_adls_files()

    # With reload_all every matching SFTP file is processed; otherwise only
    # files whose name is not yet present in the ADLS folder.
    if not self.reload_all:
        existing_set = {os.path.split(path)[1] for path in existing_files}
        files_to_process = set(matching_names) - existing_set
        self.log.info(f"Existing files in ADLS: `{existing_set}`")
    else:
        files_to_process = matching_names

    self.log.info(f"Files to process: `{files_to_process}`")

    # Stage each file in a temporary folder, then push it to ADLS.
    with tempfile.TemporaryDirectory() as temp_folder:
        for name in files_to_process:
            temp_path = os.path.join(temp_folder, name)
            adls_object = os.path.join(self.adls_folder_path, name)
            sftp_object = os.path.join(self.sftp_folder_path, name)

            self.log.info(f"Processing: `{sftp_object}`")
            try:
                sftp_client.get(sftp_object, temp_path)
                self._adls_hook.upload_file(
                    local_path=temp_path,
                    remote_path=adls_object,
                    overwrite=self.reload_all,
                )
                os.remove(temp_path)
            except IOError:
                self.log.info(f"Skipping directory `{sftp_object}`.")

    # Close ADLS Connection
    self._adls_hook.connection.close()
def execute(self, context):
    """
    Download ``self.sftp_path`` from the SFTP server into a temporary file
    and upload it to ``self.s3_bucket`` under ``self.s3_key``, replacing
    any existing key.

    ``self.s3_key`` is first replaced by the result of ``get_s3_key``.

    :param context: task context; not used by this method.
    """
    self.s3_key = self.get_s3_key(self.s3_key)

    sftp_side = SSHHook(ssh_conn_id=self.sftp_conn_id)
    s3_side = S3Hook(self.s3_conn_id)
    sftp_client = sftp_side.get_conn().open_sftp()

    # The temporary file is removed when the block exits.
    with NamedTemporaryFile("w") as staging_file:
        local_name = staging_file.name
        sftp_client.get(self.sftp_path, local_name)
        s3_side.load_file(
            filename=local_name,
            key=self.s3_key,
            bucket_name=self.s3_bucket,
            replace=True,
        )
def execute(self, context):
    """
    Download ``self.gcs_dest`` from ``self.gcs_bucket`` into a temporary
    file and upload that file to ``self.sftp_dest_path`` over SFTP, with
    confirmation enabled.

    :param context: task context; not used by this method.
    """
    sftp_side = SSHHook(ssh_conn_id=self.sftp_conn_id)
    gcs_side = GoogleCloudStorageHook(self.google_cloud_storage_conn_id)
    sftp_client = sftp_side.get_conn().open_sftp()

    # The temporary file is removed when the block exits.
    with NamedTemporaryFile("w") as staging_file:
        filename = staging_file.name
        gcs_side.download(bucket=self.gcs_bucket,
                          object=self.gcs_dest,
                          filename=filename)

        transfer_desc = "from {0} to {1}".format(filename, self.sftp_dest_path)
        self.log.info("Starting to transfer file %s", transfer_desc)
        sftp_client.put(filename, self.sftp_dest_path, confirm=True)
def execute(self, context):
    """
    Copy every S3 key under ``self.s3_path`` in ``self.s3_bucket`` to the
    ``self.sftp_path`` folder on the SFTP server.

    Each remote file is named after the last ``/``-separated component of
    its key.

    :param context: task context; not used by this method.
    """
    sftp_side = SSHHook(ssh_conn_id=self.sftp_conn_id)
    s3_side = S3Hook(self.s3_conn_id)

    keys_to_copy = s3_side.list_keys(bucket_name=self.s3_bucket,
                                     prefix=self.s3_path)
    s3_client = s3_side.get_conn()
    sftp_client = sftp_side.get_conn().open_sftp()

    for s3_key in keys_to_copy:
        base_name = s3_key.split("/")[-1]
        remote_target = os.path.join(self.sftp_path, base_name)
        # One temporary file per key, removed when the block exits.
        with NamedTemporaryFile("w") as staging_file:
            s3_client.download_file(self.s3_bucket, s3_key, staging_file.name)
            sftp_client.put(staging_file.name, remote_target)
def execute(self, context):
    """
    Fetch ``self.sftp_path`` over SFTP into a temporary file and load it
    into ``self.s3_bucket`` as ``self.s3_key``, replacing any existing key.

    ``self.s3_key`` is first replaced by the result of ``get_s3_key``.

    :param context: task context; not used by this method.
    """
    self.s3_key = self.get_s3_key(self.s3_key)

    source_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)
    target_hook = S3Hook(self.s3_conn_id)
    sftp_client = source_hook.get_conn().open_sftp()

    upload_kwargs = {
        "key": self.s3_key,
        "bucket_name": self.s3_bucket,
        "replace": True,
    }
    # The temporary file is removed when the block exits.
    with NamedTemporaryFile("w") as scratch:
        sftp_client.get(self.sftp_path, scratch.name)
        target_hook.load_file(filename=scratch.name, **upload_kwargs)
def test_ssh_connection_without_password(self, ssh_mock):
    """A hook built with a key file and no password must connect using
    ``key_filename``."""
    hook = SSHHook(
        remote_host='remote_host',
        port='port',
        username='******',
        timeout=10,
        key_file='fake.file',
    )

    expected_connect_kwargs = dict(
        hostname='remote_host',
        username='******',
        key_filename='fake.file',
        timeout=10,
        compress=True,
        port='port',
        sock=None,
    )

    with hook.get_conn():
        connect_mock = ssh_mock.return_value.connect
        connect_mock.assert_called_once_with(**expected_connect_kwargs)
def test_ssh_connection_without_password(self, ssh_mock):
    """Verify the arguments passed to ``connect`` when the hook is given a
    key file instead of a password."""
    hook_kwargs = {
        'remote_host': 'remote_host',
        'port': 'port',
        'username': '******',
        'timeout': 10,
        'key_file': 'fake.file',
    }
    hook = SSHHook(**hook_kwargs)

    with hook.get_conn():
        ssh_mock.return_value.connect.assert_called_once_with(
            hostname='remote_host',
            port='port',
            username='******',
            key_filename='fake.file',
            timeout=10,
            compress=True,
            sock=None,
        )
def test_ssh_connection_with_private_key_extra(self, ssh_mock):
    """A hook built from the private-key-extra connection must pass
    ``TEST_PKEY`` to ``connect`` as ``pkey``."""
    hook = SSHHook(
        ssh_conn_id=self.CONN_SSH_WITH_PRIVATE_KEY_EXTRA,
        remote_host='remote_host',
        port='port',
        username='******',
        timeout=10,
    )

    expected_connect_kwargs = dict(
        hostname='remote_host',
        username='******',
        pkey=TEST_PKEY,
        timeout=10,
        compress=True,
        port='port',
        sock=None,
    )

    with hook.get_conn():
        connect_mock = ssh_mock.return_value.connect
        connect_mock.assert_called_once_with(**expected_connect_kwargs)
def GetFiles(**kwargs):
    """
    Read every ``.csv`` file listed on the FTP source into a DataFrame,
    then copy a file from SFTP to S3.

    The step that wrote each DataFrame to the database is currently
    commented out, so the parsed data and the selected table name are not
    used.

    NOTE(review): the final SFTP-to-S3 section references ``self``, which is
    not defined in this function; as written it presumably raises
    ``NameError`` — confirm the intended path, key and bucket.

    :param kwargs: accepted for caller compatibility; not used.
    """
    ftp = FTPHook(ftp_conn_id=af_conn_id)
    # Build the list of all entries in the source directory ending in .csv.
    files = [x for x in ftp.list_directory(source) if str(x).endswith('.csv')]
    #ftp.close_conn()
    for file in files:
        # upload_data is defined elsewhere; the loop below treats its result
        # as a mapping of file name to CSV text.
        data_dict = upload_data(ftp, file)
        for filename in data_dict:
            df = pd.read_csv(StringIO(data_dict[filename]))
            # The destination table is chosen from the file name prefix.
            if filename.startswith("location"):
                table = "location_details"
                # Data modifications could be applied here.
                #df['UPDATE_DATE'] = pd.to_datetime(df['UPDATE_DATE'], format='%Y%m%d')
            if filename.startswith("product"):
                table = "product_details"
            # NOTE(review): `table` is only consumed by the disabled
            # database write below.
            #db = create_engine(get_postgre_connection(postgre_conn_id))
            #db_conn = db.connect()
            #try:
                #df.to_sql(name=table, con=db_conn, schema='public', if_exists='append', index=False)
            #except Exception as error:
                #print("An exception occurred:", error)
            #db_conn.close()
    # NOTE(review): the nesting of this section could not be recovered from
    # the source layout; it is placed after the loops — confirm.
    ssh_hook = SSHHook(af_conn_id)
    s3_hook = S3Hook(s3_conn_id)
    sftp_client = ssh_hook.get_conn().open_sftp()
    with NamedTemporaryFile("w") as f:
        sftp_client.get(self.sftp_path, f.name)
        s3_hook.load_file(filename=f.name, key=self.s3_key, bucket_name=self.s3_bucket, replace=True)
def execute(self, context):
    """
    Compare the size of a file on the SFTP server with its S3 object.

    The size of ``self.sftp_path`` (from SFTP ``stat``) is compared with the
    ``ContentLength`` of ``self.s3_key`` in ``self.s3_bucket``. The outcome
    is only logged; nothing is returned.

    If an ``AirflowException`` is raised while creating the S3 hook, while
    opening the SFTP session, or during the comparison, a message is logged
    and the process exits with status 1.

    :param context: task context; not used by this method.
    """
    try:
        s3_hook = S3Hook(self.s3_conn_id)
        logging.info("Connected to S3 hook")
    except AirflowException:
        logging.info("Error in Connecting to S3 Hook")
        exit(1)

    try:
        ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)
        logging.info("Connected to SSH Hook")
        ssh_client = ssh_hook.get_conn()
        sftp_client = ssh_client.open_sftp()
        logging.info("Connected to SFTP")
    except AirflowException:
        logging.info("Error in Connecting to SFTP")
        exit(1)

    try:
        with NamedTemporaryFile("w") as f:
            logging.info("Connecting to SFTP and using STAT to get properties of file.")
            ftpfilesizestat = sftp_client.stat(self.sftp_path)
            logging.info("Connected")
            ftpfilesize = ftpfilesizestat.st_size
            logging.info("size of current file in SFTP is calculated and size is " + str(ftpfilesize))

            logging.info("generating key name in s3")
            keyname = s3_hook.get_key(self.s3_key, bucket_name=self.s3_bucket)
            logging.info("key name is generated and calculating size of object in s3")
            awsfilesize = keyname.get()["ContentLength"]
            logging.info("size of the object in s3 is calculated and size is " + str(awsfilesize))

            if ftpfilesize == awsfilesize:
                logging.info("file sizes are equal")
            else:
                logging.info("file sizes are not equal")
            logging.info("SUCCEEDED")
    except AirflowException as e:
        # A %s placeholder is required: passing the error as an extra
        # argument without one makes logging report a formatting error
        # instead of emitting the message.
        logging.info("Transfer to S3 FAILED: %s", e)
        exit(1)
def sshoperator_remote(ssh_conn_id, ssh_hook=None, remote_host=None, command=None, timeout=10, do_xcom_push=None, **kwargs):
    """
    Run ``command`` over SSH and return everything it wrote to stdout.

    stdout chunks are logged at info level and stderr chunks at warning
    level as they arrive.

    :param ssh_conn_id: connection id used to build an ``SSHHook`` when
        ``ssh_hook`` is not given.
    :param ssh_hook: hook to use; takes precedence over ``ssh_conn_id``.
    :param remote_host: not used by this function.
    :param command: command to execute; a pseudo-terminal is requested when
        it starts with ``sudo``.
    :param timeout: passed to ``exec_command`` and used as the ``select``
        timeout.
    :param do_xcom_push: not used by this function.
    :return: the aggregated stdout as ``bytes``.
    """
    if ssh_conn_id and not ssh_hook:
        ssh_hook = SSHHook(ssh_conn_id=ssh_conn_id)
    get_pty = False
    # NOTE(review): `command` defaults to None, in which case this call
    # raises AttributeError — confirm callers always pass a command.
    if command.startswith('sudo'):
        get_pty = True
    # NOTE(review): bare expression; its value is discarded and it raises
    # KeyError when 'local_path' is not passed. Its nesting could not be
    # recovered from the source layout — confirm intent.
    kwargs['local_path']
    ssh_client = ssh_hook.get_conn()
    # set timeout taken as params
    stdin, stdout, stderr = ssh_client.exec_command(command=command,
                                                    get_pty=get_pty,
                                                    timeout=timeout)
    # get channels
    channel = stdout.channel
    # closing stdin
    stdin.close()
    channel.shutdown_write()
    agg_stdout = b''
    agg_stderr = b''
    # capture any initial output in case channel is closed already
    stdout_buffer_length = len(stdout.channel.in_buffer)
    if stdout_buffer_length > 0:
        agg_stdout += stdout.channel.recv(stdout_buffer_length)
    # read from both stdout and stderr
    while not channel.closed or \
            channel.recv_ready() or \
            channel.recv_stderr_ready():
        # Wait up to `timeout` seconds for the channel to become readable.
        readq, _, _ = select([channel], [], [], timeout)
        for c in readq:
            if c.recv_ready():
                line = stdout.channel.recv(len(c.in_buffer))
                line = line
                agg_stdout += line
                log.info(line.decode('utf-8').strip('\n'))
            if c.recv_stderr_ready():
                line = stderr.channel.recv_stderr(len(c.in_stderr_buffer))
                line = line
                # stderr is accumulated but not returned.
                agg_stderr += line
                log.warning(line.decode('utf-8').strip('\n'))
        # Stop once the exit status is available and both streams are
        # drained.
        if stdout.channel.exit_status_ready() \
                and not stderr.channel.recv_stderr_ready() \
                and not stdout.channel.recv_ready():
            stdout.channel.shutdown_read()
            stdout.channel.close()
            break
    stdout.close()
    stderr.close()
    return agg_stdout
class SFTPOperator(BaseOperator):
    """
    SFTPOperator for transferring files from remote host to local or vice a versa.
    This operator uses ssh_hook to open sftp transport channel that serve as basis
    for file transfer.

    :param ssh_hook: predefined ssh_hook to use for remote execution
    :type ssh_hook: :class:`SSHHook`
    :param ssh_conn_id: connection id from airflow Connections; only used
        when `ssh_hook` is not provided
    :type ssh_conn_id: str
    :param remote_host: remote host to connect; when given it replaces the
        hook's `remote_host`
    :type remote_host: str
    :param local_filepath: local file path to get or put
    :type local_filepath: str
    :param remote_filepath: remote file path to get or put
    :type remote_filepath: str
    :param operation: specify operation 'get' or 'put', defaults to put
    :type operation: str
    """
    template_fields = ('local_filepath', 'remote_filepath')

    @apply_defaults
    def __init__(self,
                 ssh_hook=None,
                 ssh_conn_id=None,
                 remote_host=None,
                 local_filepath=None,
                 remote_filepath=None,
                 operation=SFTPOperation.PUT,
                 *args,
                 **kwargs):
        """
        Store the transfer settings and validate ``operation``.

        :raises TypeError: when ``operation`` (compared case-insensitively)
            is neither ``SFTPOperation.GET`` nor ``SFTPOperation.PUT``.
        """
        super(SFTPOperator, self).__init__(*args, **kwargs)
        self.ssh_hook = ssh_hook
        self.ssh_conn_id = ssh_conn_id
        self.remote_host = remote_host
        self.local_filepath = local_filepath
        self.remote_filepath = remote_filepath
        self.operation = operation
        if self.operation.lower() not in (SFTPOperation.GET, SFTPOperation.PUT):
            raise TypeError("unsupported operation value {0}, expected {1} or {2}"
                            .format(self.operation, SFTPOperation.GET, SFTPOperation.PUT))

    def execute(self, context):
        """
        Perform the configured SFTP ``get`` or ``put``.

        :param context: task context; not used by this method.
        :return: ``None``
        :raises AirflowException: when neither a hook nor a connection id is
            available, or when anything fails during the transfer.
        """
        file_msg = None
        try:
            if self.ssh_conn_id and not self.ssh_hook:
                self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

            if not self.ssh_hook:
                raise AirflowException("can not operate without ssh_hook or ssh_conn_id")

            if self.remote_host is not None:
                self.ssh_hook.remote_host = self.remote_host

            # Use the client as a context manager so the SSH connection is
            # closed after the transfer, whether it succeeds or fails.
            with self.ssh_hook.get_conn() as ssh_client:
                sftp_client = ssh_client.open_sftp()
                if self.operation.lower() == SFTPOperation.GET:
                    file_msg = "from {0} to {1}".format(self.remote_filepath,
                                                        self.local_filepath)
                    logging.debug("Starting to transfer {0}".format(file_msg))
                    sftp_client.get(self.remote_filepath, self.local_filepath)
                else:
                    file_msg = "from {0} to {1}".format(self.local_filepath,
                                                        self.remote_filepath)
                    logging.debug("Starting to transfer file {0}".format(file_msg))
                    sftp_client.put(self.local_filepath, self.remote_filepath)

        except Exception as e:
            # Every failure is reported as an AirflowException carrying the
            # transfer description (None if the failure came before it was
            # built).
            raise AirflowException("Error while transferring {0}, error: {1}"
                                   .format(file_msg, str(e)))

        return None
class SSHOperator(BaseOperator):
    """
    SSHOperator to execute commands on given remote host using the ssh_hook.

    :param ssh_hook: predefined ssh_hook to use for remote execution.
        Either `ssh_hook` or `ssh_conn_id` needs to be provided.
    :type ssh_hook: airflow.contrib.hooks.ssh_hook.SSHHook
    :param ssh_conn_id: connection id from airflow Connections.
        `ssh_conn_id` will be ignored if `ssh_hook` is provided.
    :type ssh_conn_id: str
    :param remote_host: remote host to connect (templated)
        Nullable. If provided, it will replace the `remote_host` which was
        defined in `ssh_hook` or predefined in the connection of `ssh_conn_id`.
    :type remote_host: str
    :param command: command to execute on remote host. (templated)
    :type command: str
    :param timeout: timeout (in seconds) for executing the command.
    :type timeout: int
    :param environment: a dict of shell environment variables. Note that the
        server will reject them silently if `AcceptEnv` is not set in SSH config.
    :type environment: dict
    :param get_pty: request a pseudo-terminal from the server. Set to ``True``
        to have the remote process killed upon task timeout.
        The default is ``False`` but note that `get_pty` is forced to ``True``
        when the `command` starts with ``sudo``.
    :type get_pty: bool
    """

    template_fields = ('command', 'remote_host')
    template_ext = ('.sh',)

    @apply_defaults
    def __init__(self,
                 ssh_hook=None,
                 ssh_conn_id=None,
                 remote_host=None,
                 command=None,
                 timeout=10,
                 environment=None,
                 get_pty=False,
                 *args,
                 **kwargs):
        """Store the execution settings; see the class docstring for the
        meaning of each parameter."""
        super().__init__(*args, **kwargs)
        self.ssh_hook = ssh_hook
        self.ssh_conn_id = ssh_conn_id
        self.remote_host = remote_host
        self.command = command
        self.timeout = timeout
        self.environment = environment
        # `command` defaults to None. Only inspect it when it is set, so a
        # missing command reaches the explicit "SSH command not specified"
        # error in execute() instead of failing here with AttributeError.
        self.get_pty = (self.command.startswith('sudo') or get_pty) if self.command else get_pty

    def execute(self, context):
        """
        Run ``self.command`` on the remote host and return its stdout.

        :param context: task context; not used by this method.
        :return: the aggregated stdout on exit status 0, as ``bytes`` when
            ``core.enable_xcom_pickling`` is enabled, otherwise as a
            base64-encoded ``str``.
        :raises AirflowException: when no hook or command is available, when
            the command exits with a non-zero status, or on any other
            failure (all are wrapped as "SSH operator error: ...").
        """
        try:
            if self.ssh_conn_id:
                if self.ssh_hook and isinstance(self.ssh_hook, SSHHook):
                    self.log.info("ssh_conn_id is ignored when ssh_hook is provided.")
                else:
                    self.log.info("ssh_hook is not provided or invalid. " +
                                  "Trying ssh_conn_id to create SSHHook.")
                    self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id,
                                            timeout=self.timeout)

            if not self.ssh_hook:
                raise AirflowException("Cannot operate without ssh_hook or ssh_conn_id.")

            if self.remote_host is not None:
                self.log.info("remote_host is provided explicitly. " +
                              "It will replace the remote_host which was defined " +
                              "in ssh_hook or predefined in connection of ssh_conn_id.")
                self.ssh_hook.remote_host = self.remote_host

            if not self.command:
                raise AirflowException("SSH command not specified. Aborting.")

            with self.ssh_hook.get_conn() as ssh_client:
                self.log.info("Running command: %s", self.command)

                # set timeout taken as params
                stdin, stdout, stderr = ssh_client.exec_command(command=self.command,
                                                                get_pty=self.get_pty,
                                                                timeout=self.timeout,
                                                                environment=self.environment
                                                                )
                # get channels
                channel = stdout.channel

                # closing stdin
                stdin.close()
                channel.shutdown_write()

                agg_stdout = b''
                agg_stderr = b''

                # capture any initial output in case channel is closed already
                stdout_buffer_length = len(stdout.channel.in_buffer)

                if stdout_buffer_length > 0:
                    agg_stdout += stdout.channel.recv(stdout_buffer_length)

                # read from both stdout and stderr
                while not channel.closed or \
                        channel.recv_ready() or \
                        channel.recv_stderr_ready():
                    # Wait up to `timeout` seconds for the channel to become
                    # readable.
                    readq, _, _ = select([channel], [], [], self.timeout)
                    for c in readq:
                        if c.recv_ready():
                            line = stdout.channel.recv(len(c.in_buffer))
                            agg_stdout += line
                            self.log.info(line.decode('utf-8').strip('\n'))
                        if c.recv_stderr_ready():
                            line = stderr.channel.recv_stderr(len(c.in_stderr_buffer))
                            agg_stderr += line
                            self.log.warning(line.decode('utf-8').strip('\n'))
                    # Stop once the exit status is available and both
                    # streams are drained.
                    if stdout.channel.exit_status_ready()\
                            and not stderr.channel.recv_stderr_ready()\
                            and not stdout.channel.recv_ready():
                        stdout.channel.shutdown_read()
                        stdout.channel.close()
                        break

                stdout.close()
                stderr.close()

                exit_status = stdout.channel.recv_exit_status()
                if exit_status == 0:
                    enable_pickling = conf.getboolean(
                        'core', 'enable_xcom_pickling'
                    )
                    if enable_pickling:
                        return agg_stdout
                    else:
                        return b64encode(agg_stdout).decode('utf-8')

                else:
                    error_msg = agg_stderr.decode('utf-8')
                    raise AirflowException("error running cmd: {0}, error: {1}"
                                           .format(self.command, error_msg))

        except Exception as e:
            raise AirflowException("SSH operator error: {0}".format(str(e)))

        return True

    def tunnel(self):
        """Open a connection through the hook and fetch its transport; the
        transport is not returned."""
        ssh_client = self.ssh_hook.get_conn()
        ssh_client.get_transport()
def test_ssh_connection(self):
    """Run ``ls`` over the ``ssh_default`` connection and check that its
    stdout can be read."""
    default_hook = SSHHook(ssh_conn_id='ssh_default')
    with default_hook.get_conn() as ssh_client:
        _stdin, ls_stdout, _stderr = ssh_client.exec_command('ls')
        listing = ls_stdout.read()
        self.assertIsNotNone(listing)
class SFTPOperator(BaseOperator):
    """
    Transfer a single file between the local machine and a remote host
    over SFTP, in either direction. The SFTP channel is opened on top of
    the SSH connection provided by ``ssh_hook``.

    :param ssh_hook: predefined ssh_hook to use for remote execution.
        Either `ssh_hook` or `ssh_conn_id` needs to be provided.
    :type ssh_hook: airflow.contrib.hooks.ssh_hook.SSHHook
    :param ssh_conn_id: connection id from airflow Connections.
        `ssh_conn_id` will be ignored if `ssh_hook` is provided.
    :type ssh_conn_id: str
    :param remote_host: remote host to connect (templated)
        Nullable. If provided, it will replace the `remote_host` which was
        defined in `ssh_hook` or predefined in the connection of `ssh_conn_id`.
    :type remote_host: str
    :param local_filepath: local file path to get or put. (templated)
    :type local_filepath: str
    :param remote_filepath: remote file path to get or put. (templated)
    :type remote_filepath: str
    :param operation: specify operation 'get' or 'put', defaults to put
    :type operation: str
    :param confirm: specify if the SFTP operation should be confirmed, defaults to True
    :type confirm: bool
    :param create_intermediate_dirs: create missing intermediate directories when
        copying from remote to local and vice-versa. Default is False.

        Example: The following task would copy ``file.txt`` to the remote host
        at ``/tmp/tmp1/tmp2/`` while creating ``tmp``,``tmp1`` and ``tmp2`` if they
        don't exist. If the parameter is not passed it would error as the directory
        does not exist. ::

            put_file = SFTPOperator(
                task_id="test_sftp",
                ssh_conn_id="ssh_default",
                local_filepath="/tmp/file.txt",
                remote_filepath="/tmp/tmp1/tmp2/file.txt",
                operation="put",
                create_intermediate_dirs=True,
                dag=dag
            )

    :type create_intermediate_dirs: bool
    """
    template_fields = ('local_filepath', 'remote_filepath', 'remote_host')

    @apply_defaults
    def __init__(self,
                 ssh_hook=None,
                 ssh_conn_id=None,
                 remote_host=None,
                 local_filepath=None,
                 remote_filepath=None,
                 operation=SFTPOperation.PUT,
                 confirm=True,
                 create_intermediate_dirs=False,
                 *args,
                 **kwargs):
        """
        Store the transfer settings and validate ``operation``.

        :raises TypeError: when ``operation`` (compared case-insensitively)
            is neither ``SFTPOperation.GET`` nor ``SFTPOperation.PUT``.
        """
        super(SFTPOperator, self).__init__(*args, **kwargs)
        self.ssh_hook = ssh_hook
        self.ssh_conn_id = ssh_conn_id
        self.remote_host = remote_host
        self.local_filepath = local_filepath
        self.remote_filepath = remote_filepath
        self.operation = operation
        self.confirm = confirm
        self.create_intermediate_dirs = create_intermediate_dirs

        requested = self.operation.lower()
        if requested != SFTPOperation.GET and requested != SFTPOperation.PUT:
            raise TypeError("unsupported operation value {0}, expected {1} or {2}"
                            .format(self.operation, SFTPOperation.GET, SFTPOperation.PUT))

    def execute(self, context):
        """
        Perform the configured SFTP ``get`` or ``put``.

        :param context: task context; not used by this method.
        :return: ``self.local_filepath``
        :raises AirflowException: when neither a hook nor a connection id is
            available, or when anything fails during the transfer.
        """
        file_msg = None
        try:
            if self.ssh_conn_id:
                hook_is_usable = self.ssh_hook and isinstance(self.ssh_hook, SSHHook)
                if hook_is_usable:
                    self.log.info("ssh_conn_id is ignored when ssh_hook is provided.")
                else:
                    self.log.info("ssh_hook is not provided or invalid. "
                                  "Trying ssh_conn_id to create SSHHook.")
                    self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

            if not self.ssh_hook:
                raise AirflowException("Cannot operate without ssh_hook or ssh_conn_id.")

            if self.remote_host is not None:
                self.log.info("remote_host is provided explicitly. "
                              "It will replace the remote_host which was defined "
                              "in ssh_hook or predefined in connection of ssh_conn_id.")
                self.ssh_hook.remote_host = self.remote_host

            with self.ssh_hook.get_conn() as ssh_client:
                sftp_client = ssh_client.open_sftp()
                downloading = self.operation.lower() == SFTPOperation.GET

                if downloading:
                    local_folder = os.path.dirname(self.local_filepath)
                    if self.create_intermediate_dirs:
                        # Create the local directories; an error is only
                        # tolerated when the directory already exists.
                        try:
                            os.makedirs(local_folder)
                        except OSError:
                            if not os.path.isdir(local_folder):
                                raise
                    file_msg = "from {0} to {1}".format(self.remote_filepath,
                                                        self.local_filepath)
                    self.log.info("Starting to transfer %s", file_msg)
                    sftp_client.get(self.remote_filepath, self.local_filepath)
                else:
                    remote_folder = os.path.dirname(self.remote_filepath)
                    if self.create_intermediate_dirs:
                        _make_intermediate_dirs(
                            sftp_client=sftp_client,
                            remote_directory=remote_folder,
                        )
                    file_msg = "from {0} to {1}".format(self.local_filepath,
                                                        self.remote_filepath)
                    self.log.info("Starting to transfer file %s", file_msg)
                    sftp_client.put(self.local_filepath,
                                    self.remote_filepath,
                                    confirm=self.confirm)

        except Exception as e:
            # Every failure is reported as an AirflowException carrying the
            # transfer description (None if the failure came before it was
            # built).
            raise AirflowException("Error while transferring {0}, error: {1}"
                                   .format(file_msg, str(e)))

        return self.local_filepath
class TemplateToSFTPOperator(BaseOperator):
    """
    Upload templated text as a file on a remote server over SFTP.
    Accepts many of the same parameters as SFTPOperator.

    :param str ssh_conn_id: connection id from airflow Connections.
        `ssh_conn_id` will be ignored if `ssh_hook` is provided.
    :param bool create_intermediate_dirs: create missing intermediate
        directories on the remote host before uploading. Default is True.
    :param int file_mode: permissions to set on the remote file. eg 0o644 or 0o755
    :param file_contents: contents to upload into the file (templated)
    :param remote_filepath: remote file path to write to. (templated)
    """
    template_fields = ('file_contents', 'remote_filepath')
    template_ext = ('jinja2', )

    @apply_defaults
    def __init__(self,
                 ssh_conn_id=None,
                 ssh_hook=None,
                 file_mode=None,
                 file_contents='',
                 remote_filepath=None,
                 create_intermediate_dirs=True,
                 *args,
                 **kwargs):
        """Store the upload settings; see the class docstring for the
        meaning of each parameter."""
        super().__init__(*args, **kwargs)
        self.ssh_hook = ssh_hook
        self.ssh_conn_id = ssh_conn_id
        self.file_mode = file_mode
        self.file_contents = file_contents
        self.remote_filepath = remote_filepath
        self.create_intermediate_dirs = create_intermediate_dirs

    def execute(self, context):
        """
        Write ``self.file_contents`` to ``self.remote_filepath`` and, when
        ``self.file_mode`` is set, chmod the remote file to that mode.

        :param context: task context; not used by this method.
        :return: ``self.remote_filepath``
        :raises AirflowException: when neither a hook nor a connection id is
            available, or when anything fails during the upload.
        """
        try:
            if self.ssh_conn_id:
                hook_is_usable = self.ssh_hook and isinstance(self.ssh_hook, SSHHook)
                if hook_is_usable:
                    self.log.info(
                        "ssh_conn_id is ignored when ssh_hook is provided.")
                else:
                    self.log.info("ssh_hook is not provided or invalid. "
                                  "Trying ssh_conn_id to create SSHHook.")
                    self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

            if not self.ssh_hook:
                raise AirflowException(
                    "Cannot operate without ssh_hook or ssh_conn_id.")

            with self.ssh_hook.get_conn() as ssh_client:
                sftp_client = ssh_client.open_sftp()

                target_folder = os.path.dirname(self.remote_filepath)
                if self.create_intermediate_dirs:
                    _make_intermediate_dirs(
                        sftp_client=sftp_client,
                        remote_directory=target_folder,
                    )

                self.log.info("Starting to transfer file to %s",
                              self.remote_filepath)
                # Wrap the text in a file-like object for putfo.
                contents_buffer = StringIO(self.file_contents)
                sftp_client.putfo(contents_buffer, self.remote_filepath)

                if self.file_mode is not None:
                    sftp_client.chmod(self.remote_filepath, self.file_mode)

        except Exception as e:
            raise AirflowException(
                "Error while uploading to {0}, error: {1}".format(
                    self.remote_filepath, str(e)))

        return self.remote_filepath
class SSHOperator(BaseOperator):
    """
    SSHOperator to execute commands on given remote host using the ssh_hook.

    :param ssh_hook: predefined ssh_hook to use for remote execution.
        Either `ssh_hook` or `ssh_conn_id` needs to be provided.
    :type ssh_hook: :class:`SSHHook`
    :param ssh_conn_id: connection id from airflow Connections.
        `ssh_conn_id` will be ignored if `ssh_hook` is provided.
    :type ssh_conn_id: str
    :param remote_host: remote host to connect (templated)
        Nullable. If provided, it will replace the `remote_host` which was
        defined in `ssh_hook` or predefined in the connection of `ssh_conn_id`.
    :type remote_host: str
    :param command: command to execute on remote host. (templated)
    :type command: str
    :param timeout: timeout (in seconds) for executing the command.
    :type timeout: int
    :param do_xcom_push: return the stdout which also get set in xcom by
        airflow platform
    :type do_xcom_push: bool
    """

    template_fields = ('command', 'remote_host')
    template_ext = ('.sh',)

    @apply_defaults
    def __init__(self,
                 ssh_hook=None,
                 ssh_conn_id=None,
                 remote_host=None,
                 command=None,
                 timeout=10,
                 do_xcom_push=False,
                 *args,
                 **kwargs):
        super(SSHOperator, self).__init__(*args, **kwargs)
        self.ssh_hook = ssh_hook
        self.ssh_conn_id = ssh_conn_id
        self.remote_host = remote_host
        self.command = command
        self.timeout = timeout
        self.do_xcom_push = do_xcom_push

    def execute(self, context):
        """
        Run ``command`` on the remote host and stream its output to the log.

        :return: when ``do_xcom_push`` is set, the collected stdout (raw bytes
            if xcom pickling is enabled, otherwise a base64 string);
            ``True`` otherwise.
        :raises AirflowException: on a non-zero exit status or any other
            failure (all exceptions are wrapped).
        """
        try:
            if self.ssh_conn_id:
                if self.ssh_hook and isinstance(self.ssh_hook, SSHHook):
                    self.log.info("ssh_conn_id is ignored when ssh_hook is provided.")
                else:
                    self.log.info("ssh_hook is not provided or invalid. " +
                                  "Trying ssh_conn_id to create SSHHook.")
                    self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id,
                                            timeout=self.timeout)

            if not self.ssh_hook:
                raise AirflowException("Cannot operate without ssh_hook or ssh_conn_id.")

            if self.remote_host is not None:
                self.log.info("remote_host is provided explicitly. " +
                              "It will replace the remote_host which was defined " +
                              "in ssh_hook or predefined in connection of ssh_conn_id.")
                self.ssh_hook.remote_host = self.remote_host

            if not self.command:
                raise AirflowException("SSH command not specified. Aborting.")

            with self.ssh_hook.get_conn() as ssh_client:
                # Auto apply tty when its required in case of sudo
                get_pty = False
                if self.command.startswith('sudo'):
                    get_pty = True

                # set timeout taken as params
                stdin, stdout, stderr = ssh_client.exec_command(command=self.command,
                                                                get_pty=get_pty,
                                                                timeout=self.timeout
                                                                )
                # get channels
                channel = stdout.channel

                # closing stdin
                stdin.close()
                channel.shutdown_write()

                agg_stdout = b''
                agg_stderr = b''

                # capture any initial output in case channel is closed already
                stdout_buffer_length = len(stdout.channel.in_buffer)

                if stdout_buffer_length > 0:
                    agg_stdout += stdout.channel.recv(stdout_buffer_length)

                # read from both stdout and stderr
                while not channel.closed or \
                        channel.recv_ready() or \
                        channel.recv_stderr_ready():
                    readq, _, _ = select([channel], [], [], self.timeout)
                    for c in readq:
                        if c.recv_ready():
                            line = stdout.channel.recv(len(c.in_buffer))
                            agg_stdout += line
                            self.log.info(line.decode('utf-8').strip('\n'))
                        if c.recv_stderr_ready():
                            line = stderr.channel.recv_stderr(len(c.in_stderr_buffer))
                            agg_stderr += line
                            self.log.warning(line.decode('utf-8').strip('\n'))
                    # Stop once the command has exited and both streams
                    # have been drained.
                    if stdout.channel.exit_status_ready()\
                            and not stderr.channel.recv_stderr_ready()\
                            and not stdout.channel.recv_ready():
                        stdout.channel.shutdown_read()
                        stdout.channel.close()
                        break

                stdout.close()
                stderr.close()

                exit_status = stdout.channel.recv_exit_status()
                # Compare by value: `is 0` tests object identity, which is
                # implementation-dependent for integers.
                if exit_status == 0:
                    # returning output if do_xcom_push is set
                    if self.do_xcom_push:
                        enable_pickling = configuration.conf.getboolean(
                            'core', 'enable_xcom_pickling'
                        )
                        if enable_pickling:
                            return agg_stdout
                        else:
                            return b64encode(agg_stdout).decode('utf-8')

                else:
                    error_msg = agg_stderr.decode('utf-8')
                    raise AirflowException("error running cmd: {0}, error: {1}"
                                           .format(self.command, error_msg))

        except Exception as e:
            raise AirflowException("SSH operator error: {0}".format(str(e)))

        return True

    def tunnel(self):
        """Open a connection through the hook and look up its transport."""
        ssh_client = self.ssh_hook.get_conn()
        ssh_client.get_transport()
def test_ssh_connection(self):
    """Connect through the default SSH hook and check ``ls`` yields output."""
    default_hook = SSHHook(ssh_conn_id='ssh_default')
    with default_hook.get_conn() as ssh_client:
        # Note - Pylint will fail with no-member here due to
        # https://github.com/PyCQA/pylint/issues/1437
        _stdin, stdout, _stderr = ssh_client.exec_command('ls')  # pylint: disable=no-member
        listing = stdout.read()
        self.assertIsNotNone(listing)
class SSHAlteryxOperator(BaseOperator):
    """
    Modified SSHOperator

    SSHAlteryxOperator to execute Alteryx commands on given remote host using
    the ssh_hook. The run is treated as failed when the command's stdout
    contains a "... seconds with N error" summary.

    :param ssh_hook: predefined ssh_hook to use for remote execution.
        Either `ssh_hook` or `ssh_conn_id` needs to be provided.
    :type ssh_hook: airflow.contrib.hooks.ssh_hook.SSHHook
    :param ssh_conn_id: connection id from airflow Connections.
        `ssh_conn_id` will be ignored if `ssh_hook` is provided.
    :type ssh_conn_id: str
    :param remote_host: remote host to connect (templated)
        Nullable. If provided, it will replace the `remote_host` which was
        defined in `ssh_hook` or predefined in the connection of `ssh_conn_id`.
    :type remote_host: str
    :param command: command to execute on remote host. (templated)
    :type command: str
    :param timeout: timeout (in seconds) for executing the command.
    :type timeout: int
    """

    template_fields = ('command', 'remote_host')
    template_ext = ('.sh', )

    @apply_defaults
    def __init__(self,
                 ssh_hook=None,
                 ssh_conn_id=None,
                 remote_host=None,
                 command=None,
                 timeout=180,
                 *args,
                 **kwargs):
        super(SSHAlteryxOperator, self).__init__(*args, **kwargs)
        self.ssh_hook = ssh_hook
        self.ssh_conn_id = ssh_conn_id
        self.remote_host = remote_host
        self.command = command
        self.timeout = timeout

    def execute(self, context):
        """
        Run the command remotely and inspect its stdout for an error summary.

        :return: the decoded stdout when xcom pickling is enabled, otherwise
            the stdout re-encoded as a base64 string.
        :raises AirflowException: when no hook or command is available, or
            when stdout contains the "seconds with N error" summary.
        """
        if self.ssh_conn_id:
            if self.ssh_hook and isinstance(self.ssh_hook, SSHHook):
                self.log.info(
                    "ssh_conn_id is ignored when ssh_hook is provided.")
            else:
                self.log.info("ssh_hook is not provided or invalid. " +
                              "Trying ssh_conn_id to create SSHHook.")
                self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id,
                                        timeout=self.timeout)

        if not self.ssh_hook:
            raise AirflowException(
                "Cannot operate without ssh_hook or ssh_conn_id.")

        if self.remote_host is not None:
            self.log.info(
                "remote_host is provided explicitly. " +
                "It will replace the remote_host which was defined " +
                "in ssh_hook or predefined in connection of ssh_conn_id.")
            self.ssh_hook.remote_host = self.remote_host

        if not self.command:
            raise AirflowException("SSH command not specified. Aborting.")

        with self.ssh_hook.get_conn() as ssh_client:
            # Auto apply tty when its required in case of sudo
            get_pty = False
            if self.command.startswith('sudo'):
                get_pty = True

            self.log.info("Running command: %s", self.command)

            # set timeout taken as params
            stdin, stdout, stderr = ssh_client.exec_command(
                command=self.command, get_pty=get_pty, timeout=self.timeout)

            # Switch the streams to binary mode so the output can be decoded
            # below with undecodable bytes ignored.
            stdin._set_mode('b')
            stdout._set_mode('b')
            stderr._set_mode('b')

            agg_stdout = stdout.read().decode('utf-8', errors='ignore')
            # stderr is drained but its content is not used further.
            stderr.read().decode('utf-8', errors='ignore')

            # close the streams
            stdin.close()
            stdout.close()
            stderr.close()

            # Individual "Error - ..." lines, reported when the run failed.
            error_lines = ' '.join(re.findall(r"(Error - .*\n)", agg_stdout))

            flattened = agg_stdout.replace('.', '').replace('\n', '')
            failure = re.search(
                r'([\S\s]*?seconds with [0-9]* error)', flattened)

            if failure is None:
                enable_pickling = configuration.conf.getboolean(
                    'core', 'enable_xcom_pickling')
                if enable_pickling:
                    print(agg_stdout)
                    return agg_stdout
                # b64encode needs bytes; agg_stdout was decoded to text
                # above, so encode it back before base64-encoding.
                encoded = b64encode(
                    agg_stdout.encode('utf-8')).decode('UTF-8', errors='ignore')
                print(encoded)
                return encoded

            print(agg_stdout, '\n\nWorkflow error triggered by Alteryx')
            if error_lines:
                self.log.error("Alteryx reported: %s", error_lines)
            raise AirflowException("Workflow Error")

    def tunnel(self):
        """Open a connection through the hook and look up its transport."""
        ssh_client = self.ssh_hook.get_conn()
        ssh_client.get_transport()
class SSHOperator(BaseOperator):
    """
    SSHOperator to execute commands on given remote host using the ssh_hook.

    :param ssh_hook: predefined ssh_hook to use for remote execution
    :type ssh_hook: :class:`SSHHook`
    :param ssh_conn_id: connection id from airflow Connections; only used
        when no `ssh_hook` is given
    :type ssh_conn_id: str
    :param remote_host: remote host to connect; when given it overrides the
        host configured on the hook
    :type remote_host: str
    :param command: command to execute on remote host (templated)
    :type command: str
    :param timeout: timeout for executing the command.
    :type timeout: int
    :param do_xcom_push: return the stdout which also get set in xcom by
        airflow platform
    :type do_xcom_push: bool
    """

    template_fields = ('command', )

    @apply_defaults
    def __init__(self,
                 ssh_hook=None,
                 ssh_conn_id=None,
                 remote_host=None,
                 command=None,
                 timeout=10,
                 do_xcom_push=False,
                 *args,
                 **kwargs):
        super(SSHOperator, self).__init__(*args, **kwargs)
        self.ssh_hook = ssh_hook
        self.ssh_conn_id = ssh_conn_id
        self.remote_host = remote_host
        self.command = command
        self.timeout = timeout
        self.do_xcom_push = do_xcom_push

    def execute(self, context):
        """
        Run ``command`` on the remote host, logging stdout line by line.

        :return: when ``do_xcom_push`` is set, the collected stdout (raw bytes
            if xcom pickling is enabled, otherwise a base64 string);
            ``True`` otherwise.
        :raises AirflowException: on a non-zero exit status or any other
            failure (all exceptions are wrapped).
        """
        try:
            if self.ssh_conn_id and not self.ssh_hook:
                self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

            if not self.ssh_hook:
                raise AirflowException(
                    "can not operate without ssh_hook or ssh_conn_id")

            if self.remote_host is not None:
                self.ssh_hook.remote_host = self.remote_host

            # Validate the command before connecting so a missing command
            # does not open a connection that is never used.
            if not self.command:
                raise AirflowException(
                    "no command specified so nothing to execute here.")

            ssh_client = self.ssh_hook.get_conn()

            # Auto apply tty when its required in case of sudo
            get_pty = False
            if self.command.startswith('sudo'):
                get_pty = True

            # set timeout taken as params
            stdin, stdout, stderr = ssh_client.exec_command(
                command=self.command, get_pty=get_pty, timeout=self.timeout)
            stdin.close()

            chunks = []
            for line in stdout:
                chunks.append(line.encode('utf-8'))
                self.log.info(line.strip('\n'))
            output = b''.join(chunks)

            exit_status = stdout.channel.recv_exit_status()
            # Compare by value: `is 0` tests object identity, which is
            # implementation-dependent for integers.
            if exit_status == 0:
                # only returning on output if do_xcom_push is set
                # otherwise its not suppose to be disclosed
                if self.do_xcom_push:
                    enable_pickling = configuration.getboolean(
                        'core', 'enable_xcom_pickling')
                    if enable_pickling:
                        return output
                    else:
                        return b64encode(output).decode('utf-8')

            else:
                error_msg = stderr.read()
                raise AirflowException(
                    "error running cmd: {0}, error: {1}".format(
                        self.command, error_msg))

        except Exception as e:
            raise AirflowException("SSH operator error: {0}".format(str(e)))

        return True

    def tunnel(self):
        """Open a connection through the hook and look up its transport."""
        ssh_client = self.ssh_hook.get_conn()
        ssh_client.get_transport()
class SSHOperator(BaseOperator):
    """
    SSHOperator to execute commands on given remote host using the ssh_hook.

    :param ssh_hook: predefined ssh_hook to use for remote execution
    :type ssh_hook: :class:`SSHHook`
    :param ssh_conn_id: connection id from airflow Connections; only used
        when no `ssh_hook` is given
    :type ssh_conn_id: str
    :param remote_host: remote host to connect; when given it overrides the
        host configured on the hook
    :type remote_host: str
    :param command: command to execute on remote host (templated)
    :type command: str
    :param timeout: timeout (in seconds) for executing the command.
    :type timeout: int
    :param do_xcom_push: return the stdout which also get set in xcom by
        airflow platform
    :type do_xcom_push: bool
    """

    template_fields = ('command',)

    @apply_defaults
    def __init__(self,
                 ssh_hook=None,
                 ssh_conn_id=None,
                 remote_host=None,
                 command=None,
                 timeout=10,
                 do_xcom_push=False,
                 *args,
                 **kwargs):
        super(SSHOperator, self).__init__(*args, **kwargs)
        self.ssh_hook = ssh_hook
        self.ssh_conn_id = ssh_conn_id
        self.remote_host = remote_host
        self.command = command
        self.timeout = timeout
        self.do_xcom_push = do_xcom_push

    def execute(self, context):
        """
        Run ``command`` on the remote host and stream its output to the log.

        :return: when ``do_xcom_push`` is set, the collected stdout (raw bytes
            if xcom pickling is enabled, otherwise a base64 string);
            ``True`` otherwise.
        :raises AirflowException: on a non-zero exit status or any other
            failure (all exceptions are wrapped).
        """
        try:
            if self.ssh_conn_id and not self.ssh_hook:
                self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

            if not self.ssh_hook:
                raise AirflowException("can not operate without ssh_hook or ssh_conn_id")

            if self.remote_host is not None:
                self.ssh_hook.remote_host = self.remote_host

            # Validate the command before connecting so a missing command
            # does not open a connection that is never used.
            if not self.command:
                raise AirflowException("no command specified so nothing to execute here.")

            ssh_client = self.ssh_hook.get_conn()

            # Auto apply tty when its required in case of sudo
            get_pty = False
            if self.command.startswith('sudo'):
                get_pty = True

            # set timeout taken as params
            stdin, stdout, stderr = ssh_client.exec_command(command=self.command,
                                                            get_pty=get_pty,
                                                            timeout=self.timeout
                                                            )
            # get channels
            channel = stdout.channel

            # closing stdin
            stdin.close()
            channel.shutdown_write()

            agg_stdout = b''
            agg_stderr = b''

            # capture any initial output in case channel is closed already
            stdout_buffer_length = len(stdout.channel.in_buffer)

            if stdout_buffer_length > 0:
                agg_stdout += stdout.channel.recv(stdout_buffer_length)

            # read from both stdout and stderr
            while not channel.closed or channel.recv_ready() or channel.recv_stderr_ready():
                readq, _, _ = select([channel], [], [], self.timeout)
                for c in readq:
                    if c.recv_ready():
                        line = stdout.channel.recv(len(c.in_buffer))
                        agg_stdout += line
                        self.log.info(line.decode('utf-8').strip('\n'))
                    if c.recv_stderr_ready():
                        line = stderr.channel.recv_stderr(len(c.in_stderr_buffer))
                        agg_stderr += line
                        self.log.warning(line.decode('utf-8').strip('\n'))
                # Stop once the command has exited and both streams have
                # been drained.
                if stdout.channel.exit_status_ready()\
                        and not stderr.channel.recv_stderr_ready()\
                        and not stdout.channel.recv_ready():
                    stdout.channel.shutdown_read()
                    stdout.channel.close()
                    break

            stdout.close()
            stderr.close()

            exit_status = stdout.channel.recv_exit_status()
            # Compare by value: `is 0` tests object identity, which is
            # implementation-dependent for integers.
            if exit_status == 0:
                # returning output if do_xcom_push is set
                if self.do_xcom_push:
                    enable_pickling = configuration.getboolean('core',
                                                               'enable_xcom_pickling')
                    if enable_pickling:
                        return agg_stdout
                    else:
                        return b64encode(agg_stdout).decode('utf-8')

            else:
                error_msg = agg_stderr.decode('utf-8')
                raise AirflowException("error running cmd: {0}, error: {1}"
                                       .format(self.command, error_msg))

        except Exception as e:
            raise AirflowException("SSH operator error: {0}".format(str(e)))

        return True

    def tunnel(self):
        """Open a connection through the hook and look up its transport."""
        ssh_client = self.ssh_hook.get_conn()
        ssh_client.get_transport()
class SFTPOperator(BaseOperator):
    """
    Transfer a single file between the local machine and a remote host.

    The operator opens an SFTP channel on top of the connection supplied by
    an SSH hook and performs either a download (``get``) or an upload
    (``put``).

    :param ssh_hook: predefined ssh_hook to use for remote execution.
        Either `ssh_hook` or `ssh_conn_id` needs to be provided.
    :type ssh_hook: :class:`SSHHook`
    :param ssh_conn_id: connection id from airflow Connections.
        `ssh_conn_id` will be ignored if `ssh_hook` is provided.
    :type ssh_conn_id: str
    :param remote_host: remote host to connect (templated)
        Nullable. If provided, it will replace the `remote_host` which was
        defined in `ssh_hook` or predefined in the connection of `ssh_conn_id`.
    :type remote_host: str
    :param local_filepath: local file path to get or put. (templated)
    :type local_filepath: str
    :param remote_filepath: remote file path to get or put. (templated)
    :type remote_filepath: str
    :param operation: specify operation 'get' or 'put', defaults to put
    :type operation: str
    :param confirm: specify if the SFTP operation should be confirmed,
        defaults to True
    :type confirm: bool
    """
    template_fields = ('local_filepath', 'remote_filepath', 'remote_host')

    @apply_defaults
    def __init__(self,
                 ssh_hook=None,
                 ssh_conn_id=None,
                 remote_host=None,
                 local_filepath=None,
                 remote_filepath=None,
                 operation=SFTPOperation.PUT,
                 confirm=True,
                 *args,
                 **kwargs):
        super(SFTPOperator, self).__init__(*args, **kwargs)
        self.ssh_hook = ssh_hook
        self.ssh_conn_id = ssh_conn_id
        self.remote_host = remote_host
        self.local_filepath = local_filepath
        self.remote_filepath = remote_filepath
        self.operation = operation
        self.confirm = confirm

        # Reject anything other than the two supported operations up front.
        is_get = self.operation.lower() == SFTPOperation.GET
        if not is_get and not self.operation.lower() == SFTPOperation.PUT:
            message = "unsupported operation value {0}, expected {1} or {2}".format(
                self.operation, SFTPOperation.GET, SFTPOperation.PUT)
            raise TypeError(message)

    def execute(self, context):
        """
        Perform the configured transfer.

        :return: None
        :raises AirflowException: on any failure (all exceptions are wrapped).
        """
        file_msg = None
        try:
            if self.ssh_conn_id:
                hook_usable = self.ssh_hook and isinstance(self.ssh_hook, SSHHook)
                if not hook_usable:
                    self.log.info("ssh_hook is not provided or invalid. " +
                                  "Trying ssh_conn_id to create SSHHook.")
                    self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)
                else:
                    self.log.info("ssh_conn_id is ignored when ssh_hook is provided.")

            if not self.ssh_hook:
                raise AirflowException("Cannot operate without ssh_hook or ssh_conn_id.")

            if self.remote_host is not None:
                self.log.info("remote_host is provided explicitly. " +
                              "It will replace the remote_host which was defined " +
                              "in ssh_hook or predefined in connection of ssh_conn_id.")
                self.ssh_hook.remote_host = self.remote_host

            with self.ssh_hook.get_conn() as connection:
                sftp = connection.open_sftp()
                if self.operation.lower() == SFTPOperation.GET:
                    # download: remote -> local
                    file_msg = "from {0} to {1}".format(self.remote_filepath,
                                                        self.local_filepath)
                    self.log.debug("Starting to transfer %s", file_msg)
                    sftp.get(self.remote_filepath, self.local_filepath)
                else:
                    # upload: local -> remote
                    file_msg = "from {0} to {1}".format(self.local_filepath,
                                                        self.remote_filepath)
                    self.log.debug("Starting to transfer file %s", file_msg)
                    sftp.put(self.local_filepath,
                             self.remote_filepath,
                             confirm=self.confirm)

        except Exception as err:
            raise AirflowException("Error while transferring {0}, error: {1}"
                                   .format(file_msg, str(err)))

        return None
class SFTPOperator(BaseOperator):
    """
    Transfer a single file between the local machine and a remote host.

    The operator opens an SFTP channel on top of the connection supplied by
    an SSH hook and performs either a download (``get``) or an upload
    (``put``).

    :param ssh_hook: predefined ssh_hook to use for remote execution
    :type ssh_hook: :class:`SSHHook`
    :param ssh_conn_id: connection id from airflow Connections; only used
        when no `ssh_hook` is given
    :type ssh_conn_id: str
    :param remote_host: remote host to connect; when given it overrides the
        host configured on the hook
    :type remote_host: str
    :param local_filepath: local file path to get or put. (templated)
    :type local_filepath: str
    :param remote_filepath: remote file path to get or put. (templated)
    :type remote_filepath: str
    :param operation: specify operation 'get' or 'put', defaults to put
    :type operation: str
    :param confirm: specify if the SFTP operation should be confirmed,
        defaults to True
    :type confirm: bool
    """
    template_fields = ('local_filepath', 'remote_filepath')

    @apply_defaults
    def __init__(self,
                 ssh_hook=None,
                 ssh_conn_id=None,
                 remote_host=None,
                 local_filepath=None,
                 remote_filepath=None,
                 operation=SFTPOperation.PUT,
                 confirm=True,
                 *args,
                 **kwargs):
        super(SFTPOperator, self).__init__(*args, **kwargs)
        self.ssh_hook = ssh_hook
        self.ssh_conn_id = ssh_conn_id
        self.remote_host = remote_host
        self.local_filepath = local_filepath
        self.remote_filepath = remote_filepath
        self.operation = operation
        self.confirm = confirm

        # Reject anything other than the two supported operations up front.
        is_get = self.operation.lower() == SFTPOperation.GET
        if not is_get and not self.operation.lower() == SFTPOperation.PUT:
            message = "unsupported operation value {0}, expected {1} or {2}".format(
                self.operation, SFTPOperation.GET, SFTPOperation.PUT)
            raise TypeError(message)

    def execute(self, context):
        """
        Perform the configured transfer.

        :return: None
        :raises AirflowException: on any failure (all exceptions are wrapped).
        """
        file_msg = None
        try:
            # Build a hook from the connection id only when none was given.
            if not self.ssh_hook and self.ssh_conn_id:
                self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

            if not self.ssh_hook:
                raise AirflowException(
                    "can not operate without ssh_hook or ssh_conn_id")

            if self.remote_host is not None:
                self.ssh_hook.remote_host = self.remote_host

            connection = self.ssh_hook.get_conn()
            sftp = connection.open_sftp()

            if self.operation.lower() == SFTPOperation.GET:
                # download: remote -> local
                file_msg = "from {0} to {1}".format(self.remote_filepath,
                                                    self.local_filepath)
                self.log.debug("Starting to transfer %s", file_msg)
                sftp.get(self.remote_filepath, self.local_filepath)
            else:
                # upload: local -> remote
                file_msg = "from {0} to {1}".format(self.local_filepath,
                                                    self.remote_filepath)
                self.log.debug("Starting to transfer file %s", file_msg)
                sftp.put(self.local_filepath,
                         self.remote_filepath,
                         confirm=self.confirm)

        except Exception as err:
            raise AirflowException(
                "Error while transferring {0}, error: {1}".format(
                    file_msg, str(err)))

        return None
class SFTPOperator(BaseOperator):
    """
    SFTPOperator for transferring files from remote host to local or vice a versa.
    This operator uses ssh_hook to open sftp transport channel that serve as basis
    for file transfer.

    :param ssh_hook: predefined ssh_hook to use for remote execution.
        Either `ssh_hook` or `ssh_conn_id` needs to be provided.
    :type ssh_hook: :class:`SSHHook`
    :param ssh_conn_id: connection id from airflow Connections.
        `ssh_conn_id` will be ignored if `ssh_hook` is provided.
    :type ssh_conn_id: str
    :param remote_host: remote host to connect (templated)
        Nullable. If provided, it will replace the `remote_host` which was
        defined in `ssh_hook` or predefined in the connection of `ssh_conn_id`.
    :type remote_host: str
    :param local_filepath: local file path to get or put. (templated)
    :type local_filepath: str
    :param remote_filepath: remote file path to get or put. (templated)
    :type remote_filepath: str
    :param operation: specify operation 'get' or 'put', defaults to put
    :type operation: str
    :param confirm: specify if the SFTP operation should be confirmed,
        defaults to True
    :type confirm: bool
    :param create_intermediate_dirs: create missing intermediate directories
        when copying from remote to local and vice-versa. Default is False.

        Example: The following task would copy ``file.txt`` to the remote host
        at ``/tmp/tmp1/tmp2/`` while creating ``tmp``,``tmp1`` and ``tmp2`` if
        they don't exist. If the parameter is not passed it would error as the
        directory does not exist. ::

            put_file = SFTPOperator(
                task_id="test_sftp",
                ssh_conn_id="ssh_default",
                local_filepath="/tmp/file.txt",
                remote_filepath="/tmp/tmp1/tmp2/file.txt",
                operation="put",
                create_intermediate_dirs=True,
                dag=dag
            )

    :type create_intermediate_dirs: bool
    """
    template_fields = ('local_filepath', 'remote_filepath', 'remote_host')

    @apply_defaults
    def __init__(self,
                 ssh_hook=None,
                 ssh_conn_id=None,
                 remote_host=None,
                 local_filepath=None,
                 remote_filepath=None,
                 operation=SFTPOperation.PUT,
                 confirm=True,
                 create_intermediate_dirs=False,
                 *args,
                 **kwargs):
        super(SFTPOperator, self).__init__(*args, **kwargs)
        self.ssh_hook = ssh_hook
        self.ssh_conn_id = ssh_conn_id
        self.remote_host = remote_host
        self.local_filepath = local_filepath
        self.remote_filepath = remote_filepath
        self.operation = operation
        self.confirm = confirm
        self.create_intermediate_dirs = create_intermediate_dirs
        if not (self.operation.lower() == SFTPOperation.GET or
                self.operation.lower() == SFTPOperation.PUT):
            raise TypeError(
                "unsupported operation value {0}, expected {1} or {2}".format(
                    self.operation, SFTPOperation.GET, SFTPOperation.PUT))

    def execute(self, context):
        """
        Perform the configured transfer, creating missing directories first
        when ``create_intermediate_dirs`` is set.

        :return: None
        :raises AirflowException: on any failure (all exceptions are wrapped).
        """
        file_msg = None
        try:
            if self.ssh_conn_id:
                if self.ssh_hook and isinstance(self.ssh_hook, SSHHook):
                    self.log.info(
                        "ssh_conn_id is ignored when ssh_hook is provided.")
                else:
                    self.log.info("ssh_hook is not provided or invalid. " +
                                  "Trying ssh_conn_id to create SSHHook.")
                    self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

            if not self.ssh_hook:
                raise AirflowException(
                    "Cannot operate without ssh_hook or ssh_conn_id.")

            if self.remote_host is not None:
                self.log.info(
                    "remote_host is provided explicitly. " +
                    "It will replace the remote_host which was defined " +
                    "in ssh_hook or predefined in connection of ssh_conn_id.")
                self.ssh_hook.remote_host = self.remote_host

            with self.ssh_hook.get_conn() as ssh_client:
                sftp_client = ssh_client.open_sftp()
                if self.operation.lower() == SFTPOperation.GET:
                    local_folder = os.path.dirname(self.local_filepath)
                    # An empty dirname means the target is in the current
                    # directory; os.makedirs('') would raise, so skip it.
                    if self.create_intermediate_dirs and local_folder:
                        # Create Intermediate Directories if it doesn't exist
                        try:
                            os.makedirs(local_folder)
                        except OSError:
                            # An already-existing directory is fine; anything
                            # else is a real error.
                            if not os.path.isdir(local_folder):
                                raise
                    file_msg = "from {0} to {1}".format(
                        self.remote_filepath, self.local_filepath)
                    self.log.debug("Starting to transfer %s", file_msg)
                    sftp_client.get(self.remote_filepath, self.local_filepath)
                else:
                    remote_folder = os.path.dirname(self.remote_filepath)
                    if self.create_intermediate_dirs:
                        _make_intermediate_dirs(
                            sftp_client=sftp_client,
                            remote_directory=remote_folder,
                        )
                    file_msg = "from {0} to {1}".format(
                        self.local_filepath, self.remote_filepath)
                    self.log.debug("Starting to transfer file %s", file_msg)
                    sftp_client.put(self.local_filepath,
                                    self.remote_filepath,
                                    confirm=self.confirm)

        except Exception as e:
            raise AirflowException(
                "Error while transferring {0}, error: {1}".format(
                    file_msg, str(e)))

        return None
class SSHOperator(BaseOperator):
    """
    Execute a command on a remote host over SSH and return its stdout.

    :param ssh_hook: predefined ssh_hook to use for remote execution.
        Either `ssh_hook` or `ssh_conn_id` needs to be provided.
    :type ssh_hook: airflow.contrib.hooks.ssh_hook.SSHHook
    :param ssh_conn_id: connection id from airflow Connections.
        `ssh_conn_id` will be ignored if `ssh_hook` is provided.
    :type ssh_conn_id: str
    :param remote_host: remote host to connect (templated)
        Nullable. If provided, it will replace the `remote_host` which was
        defined in `ssh_hook` or predefined in the connection of `ssh_conn_id`.
    :type remote_host: str
    :param command: command to execute on remote host. (templated)
    :type command: str
    :param timeout: timeout (in seconds) for executing the command.
    :type timeout: int
    """

    template_fields = ('command', 'remote_host')
    template_ext = ('.sh',)

    @apply_defaults
    def __init__(self,
                 ssh_hook=None,
                 ssh_conn_id=None,
                 remote_host=None,
                 command=None,
                 timeout=10,
                 *args,
                 **kwargs):
        super(SSHOperator, self).__init__(*args, **kwargs)
        self.ssh_hook = ssh_hook
        self.ssh_conn_id = ssh_conn_id
        self.remote_host = remote_host
        self.command = command
        self.timeout = timeout

    def execute(self, context):
        """
        Run ``command`` remotely, mirroring stdout/stderr into the task log.

        :return: the collected stdout - raw bytes if xcom pickling is
            enabled, otherwise a base64 string.
        :raises AirflowException: on a non-zero exit status or any other
            failure (all exceptions are wrapped).
        """
        try:
            if self.ssh_conn_id:
                hook_usable = self.ssh_hook and isinstance(self.ssh_hook, SSHHook)
                if not hook_usable:
                    self.log.info("ssh_hook is not provided or invalid. " +
                                  "Trying ssh_conn_id to create SSHHook.")
                    self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id,
                                            timeout=self.timeout)
                else:
                    self.log.info("ssh_conn_id is ignored when ssh_hook is provided.")

            if not self.ssh_hook:
                raise AirflowException("Cannot operate without ssh_hook or ssh_conn_id.")

            if self.remote_host is not None:
                self.log.info("remote_host is provided explicitly. " +
                              "It will replace the remote_host which was defined " +
                              "in ssh_hook or predefined in connection of ssh_conn_id.")
                self.ssh_hook.remote_host = self.remote_host

            if not self.command:
                raise AirflowException("SSH command not specified. Aborting.")

            with self.ssh_hook.get_conn() as ssh_client:
                # sudo needs a pseudo-terminal, so request one automatically
                needs_pty = self.command.startswith('sudo')

                self.log.info("Running command: %s", self.command)

                stdin, stdout, stderr = ssh_client.exec_command(
                    command=self.command,
                    get_pty=needs_pty,
                    timeout=self.timeout,
                )
                channel = stdout.channel

                # nothing is sent to the remote process
                stdin.close()
                channel.shutdown_write()

                collected_out = b''
                collected_err = b''

                # pick up output that arrived before the loop starts, in
                # case the channel is closed already
                pending = len(stdout.channel.in_buffer)
                if pending > 0:
                    collected_out += stdout.channel.recv(pending)

                # drain both streams until the channel is closed and empty
                while (not channel.closed
                       or channel.recv_ready()
                       or channel.recv_stderr_ready()):
                    ready, _, _ = select([channel], [], [], self.timeout)
                    for chan in ready:
                        if chan.recv_ready():
                            data = stdout.channel.recv(len(chan.in_buffer))
                            collected_out += data
                            self.log.info(data.decode('utf-8').strip('\n'))
                        if chan.recv_stderr_ready():
                            data = stderr.channel.recv_stderr(len(chan.in_stderr_buffer))
                            collected_err += data
                            self.log.warning(data.decode('utf-8').strip('\n'))
                    finished = (stdout.channel.exit_status_ready()
                                and not stderr.channel.recv_stderr_ready()
                                and not stdout.channel.recv_ready())
                    if finished:
                        stdout.channel.shutdown_read()
                        stdout.channel.close()
                        break

                stdout.close()
                stderr.close()

                exit_status = stdout.channel.recv_exit_status()
                if exit_status != 0:
                    error_msg = collected_err.decode('utf-8')
                    raise AirflowException("error running cmd: {0}, error: {1}"
                                           .format(self.command, error_msg))

                enable_pickling = configuration.conf.getboolean(
                    'core', 'enable_xcom_pickling'
                )
                if enable_pickling:
                    return collected_out
                return b64encode(collected_out).decode('utf-8')

        except Exception as err:
            raise AirflowException("SSH operator error: {0}".format(str(err)))

        return True

    def tunnel(self):
        """Open a connection through the hook and look up its transport."""
        client = self.ssh_hook.get_conn()
        client.get_transport()
class SFTPToGCSOperator(BaseOperator):
    """
    Copies objects from an sftp host to a GCS bucket

    :param ssh_hook: predefined ssh_hook to use for remote execution
    :type ssh_hook: :class:`SSHHook`
    :param ssh_conn_id: connection id from airflow Connections; only used
        when no `ssh_hook` is given
    :type ssh_conn_id: str
    :param remote_host: remote host to connect; when given it overrides the
        host configured on the hook
    :type remote_host: str
    :param remote_filepath: remote file path to get (templated)
    :type remote_filepath: str
    :param destination_gcs_bucket: The destination Google Cloud Storage
        bucket (templated)
    :type destination_gcs_bucket: string
    :param destination_gcs_path: The destination path inside the Google Cloud
        Storage bucket (templated)
    :type destination_gcs_path: string
    :param google_cloud_storage_conn_id: The connection ID to use when
        connecting to Google cloud storage.
    :type google_cloud_storage_conn_id: string
    :param delegate_to: passed through unchanged to GoogleCloudStorageHook
    :type delegate_to: string
    """

    template_fields = ('remote_filepath', 'destination_gcs_bucket',
                       'destination_gcs_path')

    def __init__(self,
                 ssh_hook=None,
                 ssh_conn_id=None,
                 remote_host=None,
                 remote_filepath=None,
                 destination_gcs_bucket=None,
                 destination_gcs_path=None,
                 google_cloud_storage_conn_id='google_cloud_default',
                 delegate_to=None,
                 *args,
                 **kwargs):
        super(SFTPToGCSOperator, self).__init__(*args, **kwargs)
        self.ssh_hook = ssh_hook
        self.ssh_conn_id = ssh_conn_id
        self.remote_host = remote_host
        self.remote_filepath = remote_filepath
        self.destination_gcs_bucket = destination_gcs_bucket
        self.destination_gcs_path = destination_gcs_path
        self.google_cloud_storage_conn_id = google_cloud_storage_conn_id
        self.delegate_to = delegate_to

    def execute(self, context):
        """
        Download ``remote_filepath`` into a temporary file and upload that
        file to the destination bucket/path.

        :return: None
        :raises AirflowException: on any SFTP or upload failure (all
            exceptions raised during the transfer are wrapped).
        """
        gcs_hook = GoogleCloudStorageHook(
            google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
            delegate_to=self.delegate_to)

        try:
            if self.ssh_conn_id and not self.ssh_hook:
                self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

            if not self.ssh_hook:
                raise AirflowException(
                    'can not operate without ssh_hook or ssh_conn_id')

            if self.remote_host is not None:
                self.ssh_hook.remote_host = self.remote_host

            with self.ssh_hook.get_conn() as ssh_client:
                sftp_client = ssh_client.open_sftp()
                # The temporary file only provides a path: sftp writes to
                # it by name and the upload reads it back by name.
                with NamedTemporaryFile('wb') as temp_file:
                    sftp_client.get(self.remote_filepath, temp_file.name)
                    gcs_hook.upload(self.destination_gcs_bucket,
                                    self.destination_gcs_path,
                                    temp_file.name)

        except Exception as error_object:
            # The placeholder must be {0}: with a single argument, {1}
            # raised IndexError here and hid the real failure.
            raise AirflowException(
                'Error while transferring. Error details: {0}'.format(
                    str(error_object)))

        return None
class SSHOperator(BaseOperator):
    """
    SSHOperator to execute commands on given remote host using the ssh_hook.

    :param ssh_hook: predefined ssh_hook to use for remote execution
    :type ssh_hook: :class:`SSHHook`
    :param ssh_conn_id: connection id from airflow Connections; only used
        to build an ``SSHHook`` when ``ssh_hook`` is not given
    :type ssh_conn_id: str
    :param remote_host: remote host to connect; when set it overrides the
        ``remote_host`` of the hook
    :type remote_host: str
    :param command: command to execute on remote host
    :type command: str
    :param timeout: timeout for executing the command.
    :type timeout: int
    :param do_xcom_push: return the stdout which also get set in xcom by
        airflow platform
    :type do_xcom_push: bool
    """

    template_fields = ('command',)

    @apply_defaults
    def __init__(self,
                 ssh_hook=None,
                 ssh_conn_id=None,
                 remote_host=None,
                 command=None,
                 timeout=10,
                 do_xcom_push=False,
                 *args,
                 **kwargs):
        super(SSHOperator, self).__init__(*args, **kwargs)
        self.ssh_hook = ssh_hook
        self.ssh_conn_id = ssh_conn_id
        self.remote_host = remote_host
        self.command = command
        self.timeout = timeout
        self.do_xcom_push = do_xcom_push

    def execute(self, context):
        """
        Run ``command`` on the remote host.

        :param context: task execution context supplied by Airflow (unused)
        :return: the command's stdout when it exits with status 0 and
            ``do_xcom_push`` is set; ``True`` when it exits with status 0
            and ``do_xcom_push`` is not set
        :raises AirflowException: when no hook/connection id or no command
            is configured, when the command exits with a non-zero status,
            or when any other error occurs (all are wrapped with the
            "SSH operator error" prefix)
        """
        try:
            # An explicitly supplied hook wins over the connection id.
            if self.ssh_conn_id and not self.ssh_hook:
                self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

            if not self.ssh_hook:
                raise AirflowException(
                    "can not operate without ssh_hook or ssh_conn_id")

            if self.remote_host is not None:
                self.ssh_hook.remote_host = self.remote_host

            ssh_client = self.ssh_hook.get_conn()

            if not self.command:
                raise AirflowException(
                    "no command specified so nothing to execute here.")

            # Auto apply tty when its required in case of sudo
            get_pty = False
            if self.command.startswith('sudo'):
                get_pty = True

            # set timeout taken as params
            stdin, stdout, stderr = ssh_client.exec_command(
                command=self.command,
                get_pty=get_pty,
                timeout=self.timeout
            )
            exit_status = stdout.channel.recv_exit_status()
            # Compare by value: "is 0" tests object identity and only
            # works by accident of CPython's small-integer caching.
            if exit_status == 0:
                # only returning on output if do_xcom_push is set
                # otherwise its not suppose to be disclosed
                if self.do_xcom_push:
                    return stdout.read()
            else:
                error_msg = stderr.read()
                raise AirflowException(
                    "error running cmd: {0}, error: {1}"
                    .format(self.command, error_msg))

        except Exception as e:
            raise AirflowException(
                "SSH operator error: {0}".format(str(e)))

        return True

    def tunnel(self):
        """
        Open a connection through the hook and request its transport.

        The transport object is neither returned nor stored by this method.
        """
        ssh_client = self.ssh_hook.get_conn()
        ssh_client.get_transport()