def execute(self, context): self.log.info("Going to start Bulk sftp to s3 operator") sftp_hook = SFTPHook(ftp_conn_id=self.sftp_conn_id) sftp_hook.no_host_key_check = True list_dir = sftp_hook.list_directory(self.sftp_path) if len(list_dir) < 1: self.log.info("Got no files to process. Skipping") return False self.log.info(f"Got {len(list_dir)} files to move") temp_files = [] file_path_list = [] ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id) sftp_client = ssh_hook.get_conn().open_sftp() s3_hook = S3Hook(self.aws_conn_id) for file_name in list_dir: file_path = os.path.join(self.sftp_path, file_name) file_path_list.append(file_path) s3_key = str(os.path.join(self.dest_path, file_name)) file_metadata = {"ftp": NamedTemporaryFile("w"), "s3_key": s3_key} for i in range(0, 5): try: self.log.info(f"Downloading {file_path}") sftp_client.get(file_path, file_metadata["ftp"].name) file_metadata["ftp"].flush() temp_files.append(file_metadata) break except Exception: self.log.info( f"Got no response from server, waiting for next try number {(i + 1)}" ) if i < 4: time.sleep(2 ** i + random.random()) sftp_client = ( SSHHook(ssh_conn_id=self.sftp_conn_id) .get_conn() .open_sftp() ) else: raise self.log.info(f"Uploading to S3 with {self.workers} workers") with Pool(self.workers) as pool: pool.starmap( s3_hook.load_file, [ (x["ftp"].name, x["s3_key"], self.dest_bucket, True, False) for x in temp_files ], ) self.log.info("Finished executing Bulk sftp to s3 operator") return file_path_list
def runDAP(**kwargs):
    """
    Connects to App Server via SSH and executes script, capturing and reporting output.
    """
    sshSource = SSHHook(ssh_conn_id='DAP_App_Server')
    command = ('E:\\Airflow_Test\\DAP\\DAPConsoleProcessor.exe '
               '-config "E:\\Airflow_Test\\DAP\\Configuration\\DAPOrderCancellation.xml" '
               '-jobname "DAPOrderCancellation.xml"')
    # Initialize so the finally block cannot fail with NameError if the
    # connection or command itself raises.
    sshConn = None
    exitStatus = None
    stdOutput = ''
    errorMessage = ''
    try:
        sshConn = sshSource.get_conn()
        stdIn, stdOut, stdErr = sshConn.exec_command(command=command)
        exitStatus = stdOut.channel.recv_exit_status()
        errorMessage = stdErr.read().decode('ascii')
        stdOutput = stdOut.read().decode('ascii')
        if exitStatus == 0:
            print('DAP Started Successfully.')
        if errorMessage:
            logging.error('DAP Processor Failure. See Exception')
            raise Exception(errorMessage)
    finally:
        print('Exit Status: {}'.format(exitStatus))
        print('StdOut: {}'.format(stdOutput))
        print('StdErr: {}'.format(errorMessage))
        if sshConn:
            sshConn.close()
def execute(self, context):
    gcs_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id=self.google_cloud_storage_conn_id,
        delegate_to=self.delegate_to)
    try:
        if self.ssh_conn_id and not self.ssh_hook:
            self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

        if not self.ssh_hook:
            raise AirflowException(
                'Cannot operate without ssh_hook or ssh_conn_id')

        if self.remote_host is not None:
            self.ssh_hook.remote_host = self.remote_host

        with self.ssh_hook.get_conn() as ssh_client:
            sftp_client = ssh_client.open_sftp()
            with NamedTemporaryFile('wb') as temp_file:
                sftp_client.get(self.remote_filepath, temp_file.name)
                gcs_hook.upload(self.destination_gcs_bucket,
                                self.destination_gcs_path,
                                temp_file.name)
    except Exception as error_object:
        raise AirflowException(
            'Error while transferring. Error details: {0}'.format(
                str(error_object)))

    return None
def execute(self, context): try: if self.ssh_conn_id: if self.ssh_hook and isinstance(self.ssh_hook, SSHHook): self.log.info("ssh_conn_id is ignored when ssh_hook is provided.") else: self.log.info("ssh_hook is not provided or invalid. " + "Trying ssh_conn_id to create SSHHook.") self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id) if not self.ssh_hook: raise AirflowException("Cannot operate without ssh_hook or ssh_conn_id.") with self.ssh_hook.get_conn() as ssh_client: sftp_client = ssh_client.open_sftp() remote_folder = os.path.dirname(self.remote_filepath) if self.create_intermediate_dirs: _make_intermediate_dirs( sftp_client=sftp_client, remote_directory=remote_folder, ) self.log.info("Starting to transfer file to %s", self.remote_filepath) file_contents_fo = StringIO(self.file_contents) sftp_client.putfo(file_contents_fo, self.remote_filepath) if self.file_mode is not None: sftp_client.chmod(self.remote_filepath, self.file_mode) except Exception as e: raise AirflowException("Error while uploading to {0}, error: {1}" .format(self.remote_filepath, str(e))) return self.remote_filepath
def execute(self, context):
    try:
        s3_hook = S3Hook(self.s3_conn_id)
        logging.info("Connected to S3 hook")
    except AirflowException as e:
        logging.info("Error in Connecting to S3 Hook")
        exit(1)

    try:
        ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)
        logging.info("Connected to SSH Hook")
        ssh_client = ssh_hook.get_conn()
        sftp_client = ssh_client.open_sftp()
        logging.info("Connecting to SFTP")
    except AirflowException as e:
        logging.info("Error in Connecting to SFTP")
        exit(1)

    try:
        with NamedTemporaryFile("w") as f:
            sftp_client.get(self.sftp_path, f.name)
            s3_hook.load_file(filename=f.name,
                              key=self.s3_key,
                              bucket_name=self.s3_bucket,
                              replace=True)
            logging.info("SUCCEEDED")
    except AirflowException as e:
        logging.info("Transfer to S3 FAILED: %s", str(e))
        exit(1)
def execute(self, context): """ :raises AirflowException: when the SSH endpoint of the HDI cluster cannot be found """ azure_hook = AzureHDInsightHook(azure_conn_id=self.azure_conn_id) azure_conn_opts = azure_hook.get_connection( self.azure_conn_id).extra_dejson ssh_username = azure_conn_opts['SSH_USER_NAME'] ssh_password = azure_conn_opts['SSH_PASSWORD'] state = azure_hook.get_cluster_state(self.cluster_name) for endpoint in state.connectivity_endpoints: if endpoint.name == 'SSH': ssh_endpoint = endpoint.location ssh_port = endpoint.port if not ssh_endpoint: raise AirflowException( "Could not find SSH endpoint for cluster {}", self.cluster_name) self.ssh_hook = SSHHook(remote_host=ssh_endpoint, port=ssh_port, username=ssh_username, password=ssh_password) self.log.info("Running SSH command on cluster (%s): %s", self.cluster_name, self.command) super(AzureHDInsightSshOperator, self).execute(context)
def get_sub_ssh_cmds_dag(parent_dag, task_id, args):
    ssh_dag = DAG(
        '%s.%s' % (parent_dag.dag_id, task_id),
        default_args=args,
        start_date=args['start_date'],
        schedule_interval=parent_dag.schedule_interval,
    )

    start = DummyOperator(task_id='ssh_start', dag=ssh_dag)
    end = DummyOperator(task_id='ssh_end', dag=ssh_dag)

    # Generate the tasks to submit dynamically, depending on the number of
    # Hive scripts that need to be run.
    response = s3_client.list_objects_v2(
        Bucket=wk_conf.get('s3_bucket'),
        Prefix=wk_conf.get('s3_hive_script_location'))
    hive_scripts = [c.get('Key') for c in response.get('Contents')]

    if len(hive_scripts) > 0:
        ssh_emr_hook = SSHHook(conn_id='ssh_emr_default')
        ssh_tasks = [
            SSHExecuteOperator(
                task_id=str(key.replace(':', '_').replace('/', '_')),
                ssh_hook=ssh_emr_hook,
                bash_command='hive -f "s3://' + wk_conf.get('s3_bucket') + '/' + str(key) + '"',
                dag=ssh_dag)
            for key in hive_scripts if key.endswith('hql')]
        start.set_downstream(ssh_tasks)
        end.set_upstream(ssh_tasks)

    # If no Hive scripts were generated, a short-circuit step at the beginning
    # of the main DAG skips this sub-DAG.
    return ssh_dag
def execute(self, context):
    file_msg = None
    try:
        if self.ssh_conn_id and not self.ssh_hook:
            self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

        if not self.ssh_hook:
            raise AirflowException(
                "can not operate without ssh_hook or ssh_conn_id")

        if self.remote_host is not None:
            self.ssh_hook.remote_host = self.remote_host

        ssh_client = self.ssh_hook.get_conn()
        sftp_client = ssh_client.open_sftp()

        if self.operation.lower() == SFTPOperation.GET:
            file_msg = "from {0} to {1}".format(self.remote_filepath,
                                                self.local_filepath)
            self.log.debug("Starting to transfer %s", file_msg)
            sftp_client.get(self.remote_filepath, self.local_filepath)
        else:
            file_msg = "from {0} to {1}".format(self.local_filepath,
                                                self.remote_filepath)
            self.log.debug("Starting to transfer file %s", file_msg)
            sftp_client.put(self.local_filepath,
                            self.remote_filepath,
                            confirm=self.confirm)
    except Exception as e:
        raise AirflowException(
            "Error while transferring {0}, error: {1}".format(
                file_msg, str(e)))

    return None
def setUp(self):
    from airflow.contrib.hooks.ssh_hook import SSHHook
    hook = SSHHook(ssh_conn_id='ssh_default')
    hook.no_host_key_check = True
    args = {
        'owner': 'airflow',
        'start_date': DEFAULT_DATE,
    }
    dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once', default_args=args)
    dag.schedule_interval = '@once'
    self.hook = hook
    self.dag = dag
    self.test_dir = "/tmp"
    self.test_local_dir = "/tmp/tmp2"
    self.test_remote_dir = "/tmp/tmp1"
    self.test_local_filename = 'test_local_file'
    self.test_remote_filename = 'test_remote_file'
    self.test_local_filepath = '{0}/{1}'.format(self.test_dir,
                                                self.test_local_filename)
    # Local Filepath with Intermediate Directory
    self.test_local_filepath_int_dir = '{0}/{1}'.format(
        self.test_local_dir, self.test_local_filename)
    self.test_remote_filepath = '{0}/{1}'.format(self.test_dir,
                                                 self.test_remote_filename)
    # Remote Filepath with Intermediate Directory
    self.test_remote_filepath_int_dir = '{0}/{1}'.format(
        self.test_remote_dir, self.test_remote_filename)
def execute(self, context): ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id) ssh_client = ssh_hook.get_conn() sftp_client = ssh_client.open_sftp() # Get list of files in sftp_path self.log.info(f"Getting list of files in sftp_path: `{self.sftp_folder_path}`") path_content = sftp_client.listdir(self.sftp_folder_path) files = [ file for file in path_content if fnmatch.fnmatch(file, self.sftp_filename) ] sftp_object = None try: if not files: self.log.info( f"No files found in folder that matches `{self.sftp_filename}` parameter." ) for file in files: sftp_object = os.path.join(self.sftp_folder_path, file) sftp_client.remove(path=sftp_object) self.log.info(f"Deleted file `{sftp_object}`") except IOError as ex: # IOError raised by client does not consistently use the same # number of arguments when raised. When a file does not exist # the first argument is the error code `2`. If a folder is # passed then only a text error is used. If a permissions # error occurs then the first argument is error code 13. # # We only want to handle when a file does not exist, all other # exceptions should be reraised to fail the Airflow task. if ex.args[0] == 2: self.log.info(f"File does not exist `{sftp_object}`") else: raise
def setUp(self):
    from airflow.contrib.hooks.ssh_hook import SSHHook
    from airflow.hooks.S3_hook import S3Hook

    hook = SSHHook(ssh_conn_id='ssh_default')
    s3_hook = S3Hook('aws_default')
    hook.no_host_key_check = True
    args = {
        'owner': 'airflow',
        'start_date': DEFAULT_DATE,
        'provide_context': True
    }
    dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once', default_args=args)
    dag.schedule_interval = '@once'

    self.hook = hook
    self.s3_hook = s3_hook
    self.ssh_client = self.hook.get_conn()
    self.sftp_client = self.ssh_client.open_sftp()
    self.dag = dag
    self.s3_bucket = BUCKET
    self.sftp_path = SFTP_PATH
    self.s3_key = S3_KEY
def execute(self, context):
    if not self._adls_hook:
        self._adls_hook = ADLSGen2Hook(
            container=self.adls_container,
            azure_data_lake_conn_id=self.azure_data_lake_conn_id,
        )
    ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)
    ssh_client = ssh_hook.get_conn()
    sftp_client = ssh_client.open_sftp()

    # Get list of files in ADLS folder
    source_files = [
        os.path.split(file)[1]  # get only the file portion of the path
        for file in self._get_adls_files()
        if fnmatch.fnmatch(os.path.split(file)[1], self.source_object)
    ]
    self.log.info(f"Source Files: `{source_files}`")

    # Get list of files in sftp_path
    try:
        self.log.info(
            f"Getting list of files in sftp_path: `{self.sftp_folder_path}`"
        )
        sftp_files = sftp_client.listdir(self.sftp_folder_path)
    except IOError as e:
        self.log.error(
            f"The folder `{self.sftp_folder_path}` does not exist on the sftp server."
        )
        raise e

    # Determine the files to be processed. If all files are to be reloaded,
    # process all files in the ADLS folder that match `source_object`.
    # Otherwise, only process files whose names do not already exist in the
    # sftp folder.
    if self.reload_all:
        files_to_process = source_files
        self.log.info(f"Files to process: `{files_to_process}`")
    else:
        self.log.info(f"Existing files in sftp folder: `{sftp_files}`")
        files_to_process = set(source_files) - set(sftp_files)
        self.log.info(f"Files to process: `{files_to_process}`")

    # create temporary folder and process files
    with tempfile.TemporaryDirectory() as temp_folder:
        for file in files_to_process:
            temp_path = os.path.join(temp_folder, file)
            adls_object = os.path.join(self.adls_folder_path, file)
            sftp_object = os.path.join(self.sftp_folder_path, file)
            self.log.info(f"Processing file: `{adls_object}`")
            self._adls_hook.download_file(
                local_path=temp_path, remote_path=adls_object, overwrite=True
            )
            sftp_client.put(localpath=temp_path, remotepath=sftp_object)
            os.remove(temp_path)

    # Close ADLS Connection
    self._adls_hook.connection.close()
def execute(self, context): file_msg = None try: if self.ssh_conn_id: if self.ssh_hook and isinstance(self.ssh_hook, SSHHook): self.log.info( "ssh_conn_id is ignored when ssh_hook is provided.") else: self.log.info("ssh_hook is not provided or invalid. " + "Trying ssh_conn_id to create SSHHook.") self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id) if not self.ssh_hook: raise AirflowException( "Cannot operate without ssh_hook or ssh_conn_id.") if self.remote_host is not None: self.log.info( "remote_host is provided explicitly. " + "It will replace the remote_host which was defined " + "in ssh_hook or predefined in connection of ssh_conn_id.") self.ssh_hook.remote_host = self.remote_host with self.ssh_hook.get_conn() as ssh_client: sftp_client = ssh_client.open_sftp() if self.operation.lower() == SFTPOperation.GET: local_folder = os.path.dirname(self.local_filepath) if self.create_intermediate_dirs: # Create Intermediate Directories if it doesn't exist try: os.makedirs(local_folder) except OSError: if not os.path.isdir(local_folder): raise file_msg = "from {0} to {1}".format( self.remote_filepath, self.local_filepath) self.log.debug("Starting to transfer %s", file_msg) sftp_client.get(self.remote_filepath, self.local_filepath) else: remote_folder = os.path.dirname(self.remote_filepath) if self.create_intermediate_dirs: _make_intermediate_dirs( sftp_client=sftp_client, remote_directory=remote_folder, ) file_msg = "from {0} to {1}".format( self.local_filepath, self.remote_filepath) self.log.debug("Starting to transfer file %s", file_msg) sftp_client.put(self.local_filepath, self.remote_filepath, confirm=self.confirm) except Exception as e: raise AirflowException( "Error while transferring {0}, error: {1}".format( file_msg, str(e))) return None
def test_conn_with_extra_parameters(self):
    db.merge_conn(
        models.Connection(
            conn_id='ssh_with_extra',
            host='localhost',
            conn_type='ssh',
            extra='{"compress" : true, "no_host_key_check" : "true"}'))
    ssh_hook = SSHHook(ssh_conn_id='ssh_with_extra')
    self.assertEqual(ssh_hook.compress, True)
    self.assertEqual(ssh_hook.no_host_key_check, True)
def check_for_file_py(**kwargs):
    path = kwargs.get('path', None)
    sftp_conn_id = kwargs.get('sftp_conn_id', None)
    #filename = kwargs.get('templates_dict').get('filename', None)
    ssh_hook = SSHHook(ssh_conn_id=sftp_conn_id)
    sftp_client = ssh_hook.get_conn().open_sftp()
    ftp_files = sftp_client.listdir(path)
    for filename in ftp_files:
        print(filename)
        logging.info('Filename: ' + str(filename))
def execute(self, context): try: if self.ssh_conn_id and not self.ssh_hook: self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id) if not self.ssh_hook: raise AirflowException( "can not operate without ssh_hook or ssh_conn_id") if self.remote_host is not None: self.ssh_hook.remote_host = self.remote_host ssh_client = self.ssh_hook.get_conn() if not self.command: raise AirflowException( "no command specified so nothing to execute here.") # Auto apply tty when its required in case of sudo get_pty = False if self.command.startswith('sudo'): get_pty = True # set timeout taken as params stdin, stdout, stderr = ssh_client.exec_command( command=self.command, get_pty=get_pty, timeout=self.timeout) stdin.close() output = b'' for line in stdout: output += line.encode('utf-8') self.log.info(line.strip('\n')) exit_status = stdout.channel.recv_exit_status() if exit_status is 0: # only returning on output if do_xcom_push is set # otherwise its not suppose to be disclosed if self.do_xcom_push: enable_pickling = configuration.getboolean( 'core', 'enable_xcom_pickling') if enable_pickling: return output else: return b64encode(output).decode('utf-8') else: error_msg = stderr.read() raise AirflowException( "error running cmd: {0}, error: {1}".format( self.command, error_msg)) except Exception as e: raise AirflowException("SSH operator error: {0}".format(str(e))) return True
def get_crawler_report() -> str:
    """Get crawler report."""
    ssh = SSHHook(ssh_conn_id='ssh_big_airflow')
    client = ssh.get_conn()
    stdin, stdout, stderr = client.exec_command("""
        docker exec `docker ps --filter name=bigscrapy_projects_airflow -q` \
            sh -c 'cat /bigcrawler-scrapy/summary.txt'
    """)
    message = "".join([line for line in stdout.readlines()])
    print(f'crawler_report: {message}')
    return message
def execute(self, context):
    self.s3_key = self.get_s3_key(self.s3_key)
    ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)
    s3_hook = S3Hook(self.s3_conn_id)

    s3_client = s3_hook.get_conn()
    sftp_client = ssh_hook.get_conn().open_sftp()

    with NamedTemporaryFile("w") as f:
        s3_client.download_file(self.s3_bucket, self.s3_key, f.name)
        sftp_client.put(f.name, self.sftp_path)
def setUp(self):
    from airflow.contrib.hooks.ssh_hook import SSHHook
    hook = SSHHook(ssh_conn_id='ssh_default')
    hook.no_host_key_check = True
    args = {
        'owner': 'airflow',
        'start_date': DEFAULT_DATE,
    }
    dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once', default_args=args)
    dag.schedule_interval = '@once'
    self.hook = hook
    self.dag = dag
def execute(self, context):
    if not self._adls_hook:
        self._adls_hook = ADLSGen2Hook(
            container=self.adls_container,
            azure_data_lake_conn_id=self.azure_data_lake_conn_id,
        )
    ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)
    ssh_client = ssh_hook.get_conn()
    sftp_client = ssh_client.open_sftp()

    # Get list of files in sftp_path
    self.log.info(f"Getting list of files in sftp_path: `{self.sftp_folder_path}`")
    path_content = sftp_client.listdir(self.sftp_folder_path)
    files = [
        file for file in path_content if fnmatch.fnmatch(file, self.sftp_filename)
    ]

    # Get files that already exist in the ADLS folder
    existing_files = self._get_adls_files()

    # Determine the files to be processed. If all files are to be reloaded then
    # process all files in the sftp file list. If all files are not to be
    # reloaded then only process files for which the file name does not
    # currently exist in the ADLS folder
    if self.reload_all:
        files_to_process = files
    else:
        existing_set = {os.path.split(filename)[1] for filename in existing_files}
        files_to_process = set(files) - existing_set
        self.log.info(f"Existing files in ADLS: `{existing_set}`")
    self.log.info(f"Files to process: `{files_to_process}`")

    # create temporary folder and process files
    with tempfile.TemporaryDirectory() as temp_folder:
        for file in files_to_process:
            temp_path = os.path.join(temp_folder, file)
            adls_object = os.path.join(self.adls_folder_path, file)
            sftp_object = os.path.join(self.sftp_folder_path, file)
            self.log.info(f"Processing: `{sftp_object}`")
            try:
                sftp_client.get(sftp_object, temp_path)
                self._adls_hook.upload_file(
                    local_path=temp_path,
                    remote_path=adls_object,
                    overwrite=self.reload_all,
                )
                os.remove(temp_path)
            except IOError:
                self.log.info(f"Skipping directory `{sftp_object}`.")

    # Close ADLS Connection
    self._adls_hook.connection.close()
def test_conn_with_extra_parameters(self):
    from airflow.contrib.hooks.ssh_hook import SSHHook
    db.merge_conn(
        models.Connection(conn_id='ssh_with_extra',
                          host='localhost',
                          conn_type='ssh',
                          extra='{"compress" : true, "no_host_key_check" : "true"}'))
    ssh_hook = SSHHook(ssh_conn_id='ssh_with_extra', keepalive_interval=10)
    ssh_hook.get_conn()
    self.assertEqual(ssh_hook.compress, True)
    self.assertEqual(ssh_hook.no_host_key_check, True)
def execute(self, context):
    self.s3_key = self.get_s3_key(self.s3_key)
    ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)
    s3_hook = S3Hook(self.s3_conn_id)

    sftp_client = ssh_hook.get_conn().open_sftp()

    with NamedTemporaryFile("w") as f:
        sftp_client.get(self.sftp_path, f.name)
        s3_hook.load_file(filename=f.name,
                          key=self.s3_key,
                          bucket_name=self.s3_bucket,
                          replace=True)
def setUp(self):
    configuration.load_test_config()
    from airflow.contrib.hooks.ssh_hook import SSHHook
    hook = SSHHook()
    hook.no_host_key_check = True
    args = {
        'owner': 'airflow',
        'start_date': DEFAULT_DATE,
        'provide_context': True
    }
    dag = DAG(TEST_DAG_ID + 'test_schedule_dag_once', default_args=args)
    dag.schedule_interval = '@once'
    self.hook = hook
    self.dag = dag
def execute(self, context):
    ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)
    gcs_hook = GoogleCloudStorageHook(self.google_cloud_storage_conn_id)

    sftp_client = ssh_hook.get_conn().open_sftp()

    with NamedTemporaryFile("w") as f:
        filename = f.name
        gcs_hook.download(bucket=self.gcs_bucket,
                          object=self.gcs_dest,
                          filename=filename)
        file_msg = "from {0} to {1}".format(filename, self.sftp_dest_path)
        self.log.info("Starting to transfer file %s", file_msg)
        sftp_client.put(filename, self.sftp_dest_path, confirm=True)
def execute(self, context):
    ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)
    s3_hook = S3Hook(self.s3_conn_id)

    s3_files = s3_hook.list_keys(bucket_name=self.s3_bucket, prefix=self.s3_path)
    s3_client = s3_hook.get_conn()
    sftp_client = ssh_hook.get_conn().open_sftp()

    for key in s3_files:
        file_name = key.split("/")[-1]
        with NamedTemporaryFile("w") as f:
            s3_client.download_file(self.s3_bucket, key, f.name)
            sftp_client.put(f.name, os.path.join(self.sftp_path, file_name))
def test_ssh_connection_without_password(self, ssh_mock):
    hook = SSHHook(remote_host='remote_host',
                   port='port',
                   username='******',
                   timeout=10,
                   key_file='fake.file')

    with hook.get_conn():
        ssh_mock.return_value.connect.assert_called_once_with(
            hostname='remote_host',
            username='******',
            key_filename='fake.file',
            timeout=10,
            compress=True,
            port='port',
            sock=None)
def test_tunnel_without_password(self, ssh_mock):
    hook = SSHHook(remote_host='remote_host',
                   port='port',
                   username='******',
                   timeout=10,
                   key_file='fake.file')

    with hook.get_tunnel(1234):
        ssh_mock.assert_called_once_with('remote_host',
                                         ssh_port='port',
                                         ssh_username='******',
                                         ssh_pkey='fake.file',
                                         ssh_proxy=None,
                                         local_bind_address=('localhost',),
                                         remote_bind_address=('localhost', 1234),
                                         host_pkey_directories=[],
                                         logger=hook.log)
def test_ssh_connection_with_private_key_extra(self, ssh_mock):
    hook = SSHHook(
        ssh_conn_id=self.CONN_SSH_WITH_PRIVATE_KEY_EXTRA,
        remote_host='remote_host',
        port='port',
        username='******',
        timeout=10,
    )

    with hook.get_conn():
        ssh_mock.return_value.connect.assert_called_once_with(
            hostname='remote_host',
            username='******',
            pkey=TEST_PKEY,
            timeout=10,
            compress=True,
            port='port',
            sock=None)
def test_tunnel(self):
    hook = SSHHook(ssh_conn_id='ssh_default')

    import subprocess
    import socket

    server_handle = subprocess.Popen(["python", "-c", HELLO_SERVER_CMD],
                                     stdout=subprocess.PIPE)
    with hook.create_tunnel(2135, 2134):
        server_output = server_handle.stdout.read(5)
        self.assertEqual(server_output, b"ready")
        s = socket.socket()
        s.connect(("localhost", 2135))
        response = s.recv(5)
        self.assertEqual(response, b"hello")
        s.close()
        output, _ = server_handle.communicate()
        self.assertEqual(server_handle.returncode, 0)
def GetFiles(**kwargs): """ this function downloads the files from the source host and writes it into the DB """ ftp = FTPHook(ftp_conn_id=af_conn_id) #create a list from all files on the destination what ends with .csv files = [x for x in ftp.list_directory(source) if str(x).endswith('.csv')] #ftp.close_conn() for file in files: data_dict = upload_data(ftp, file) for filename in data_dict: df = pd.read_csv(StringIO(data_dict[filename])) #based on the file names the destination table has to be set if filename.startswith("location"): table = "location_details" #here data modifications could aply #df['UPDATE_DATE'] = pd.to_datetime(df['UPDATE_DATE'], format='%Y%m%d') if filename.startswith("product"): table = "product_details" #db = create_engine(get_postgre_connection(postgre_conn_id)) #db_conn = db.connect() #try: #df.to_sql(name=table, con=db_conn, schema='public', if_exists='append', index=False) #except Exception as error: #print("An exception occurred:", error) #db_conn.close() ssh_hook = SSHHook(af_conn_id) s3_hook = S3Hook(s3_conn_id) sftp_client = ssh_hook.get_conn().open_sftp() with NamedTemporaryFile("w") as f: sftp_client.get(self.sftp_path, f.name) s3_hook.load_file(filename=f.name, key=self.s3_key, bucket_name=self.s3_bucket, replace=True)