class FTPFSHook(FileSystemHook):
    conn_type = 'ftp_filesystem'
    conn_type_long = 'FTP FileSystem'

    def __init__(self, conn_params: Connection):
        from airflow.contrib.hooks.ftp_hook import FTPHook
        self.conn_id = conn_params.conn_id
        self.af_ftp_hook = FTPHook(ftp_conn_id=self.conn_id)
        # extra_dejson is a dict, so use .get() to read the optional base_path
        self.base_path = Path(conn_params.extra_dejson.get('base_path', '/'))

    def list_path(self, path: str, recursive: bool = False) -> List[str]:
        if recursive:
            raise NotImplementedError('Recursive list not implemented for FTP')
        else:
            return self.af_ftp_hook.list_directory(str(self.base_path / path))

    def write_data(self, path: str, data: Union[str, bytes, BytesIO]):
        if isinstance(data, str):
            data = data.encode()
        if isinstance(data, bytes):
            data = BytesIO(data)
        self.af_ftp_hook.store_file(str(self.base_path / path), data)

    def read_data(self, path: str) -> BytesIO:
        result = BytesIO()
        self.af_ftp_hook.retrieve_file(str(self.base_path / path), result)
        return result
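A minimal usage sketch for the hook above, assuming an existing Airflow connection; the connection id 'ftp_filesystem_default' and the 'reports/demo.txt' path are illustrative placeholders, not values from the original code.

# Hypothetical usage sketch; connection id and paths are illustrative only.
from airflow.hooks.base_hook import BaseHook

conn = BaseHook.get_connection('ftp_filesystem_default')  # assumed connection id
hook = FTPFSHook(conn)
hook.write_data('reports/demo.txt', 'hello world')        # str -> bytes -> BytesIO
print(hook.list_path('reports'))
print(hook.read_data('reports/demo.txt').getvalue())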
def execute(self, context): """ Main execution point. Steps that are done 1) connecting to Postgres DB 2) queering DB and pull data into pandas dataframe 3) dump data from dataframe into file 4) send file on SFTP server. :param context: :return: none """ logging.info(f'Preparing dataframe...') psql_hook = PostgresExtendedHook().get_hook(self.postgres_conn_id) df = psql_hook.get_pandas_df(sql=self.sql) logging.info('Writing data into temp file') with NamedTemporaryFile() as f: if self.file_desc['format'].lower() == 'csv': df.to_csv(f.name, sep=';', quotechar='|') if self.file_desc['format'].lower() == 'parquet': df.to_parquet(f.name, engine='pyarrow') try: logging.info('Sending file to FTP') print(self.sftp_conn_id) ftp_hook = FTPHook(ftp_conn_id=self.sftp_conn_id) conn = ftp_hook.get_conn() f.flush() conn.set_pasv(False) conn.storbinary( f'STOR {self.file_desc["name"]}.{self.file_desc["format"]}', open(f.name, 'rb'), 1) except Exception as ex: raise AirflowException( f'Could not put file on SFTP. Details {ex}')
def get_ftp_files(**kwargs):
    conn = FTPHook('accela_ftp')
    d = conn.describe_directory('/opentext/TEST')
    for t in tables:
        r = get_recent_file(t, d)
        conn.retrieve_file(r['name'], f"/tmp/{t}.csv")
def ftp_disconnect(hook: FTPHook) -> None:
    """Disconnect from FTP.

    Args:
        hook (FTPHook): FTPHook instance.
    """
    try:
        hook.close_conn()
    except Exception:
        pass
def GetFiles(**kwargs): """ this function downloads the files from the source host and writes it into the DB """ ftp = FTPHook(ftp_conn_id=af_conn_id) #ftp.get_conn() #create a list from all files on the destination what ends with .csv files = [x for x in ftp.list_directory(source) if str(x).endswith('.csv')] #ftp.close_conn() for file in files: data_dict = upload_data(ftp, file) for filename in data_dict: #df = pd.read_csv(StringIO(data_dict[filename]), names=['','','','','']) #based on the file names the destination table has to be set if filename.startswith("location"): table = "location_details_test" column_list = 'load from constans' #here data modifications could aply #df['UPDATE_DATE'] = pd.to_datetime(df['UPDATE_DATE'], format='%Y%m%d') if filename.startswith("product"): table = "product_details" column_list = 'load from constans' df = pd.read_csv(StringIO(data_dict[filename]), skiprows=1, names=column_list) df['FILE_NAME'] = filename db = create_engine(get_postgre_connection(postgre_conn_id)) db_conn = db.connect() try: df.to_sql(name=table, con=db_conn, schema='public', if_exists='append', index=False) except Exception as error: print("An exception occurred:", error) db_conn.close() taks_id = "S3_" + table sftptoaws = SFTPToS3Operator(task_id=taks_id, sftp_conn_id=af_conn_id, sftp_path="/root/airflow_test/" + file, s3_conn_id=s3_conn_id, s3_bucket='afksiskot', s3_key=file + '.' + rundt, dag=dag) sftptoaws.execute(context=kwargs)
def execute(self, context): """ Picks up all files from a source directory and dumps them into a root directory system, organized by dagid, taskid and execution_date For anonimous username = '******' and password = '******'. """ execution_date = context['execution_date'].strftime(DATE_FORMAT) #source_dir = src_hook.get_path() #SOme info: self.dag.dag_id, self.task_id loging.info("{0} Execution copy file from FTP to Samba Share".format(execution_date)) if not self.src_conn_id: raise AirflowException("Cannot operate without src_conn_id.") if not self.dst_conn_id: raise AirflowException("Cannot operate without dst_conn_id.") src_hook = FTPHook(conn_id=self.src_conn_id) dst_hook = SambaHook(samba_conn_id=self.dst_conn_id) #samba.push_from_local(self.destination_filepath, tmp_file.name) if not self.src_filename and self.dest_filename: raise AirflowException("FtpToSambaOperator: dest_filename specified but no src_filename.") if not self.src_filename: src_hook.
def ftp_connect(ftp_conn_id: str) -> FTPHook:
    """Connect to FTP.

    Args:
        ftp_conn_id (str): ftp conn_id.

    Returns:
        FTPHook: FTPHook instance.
    """
    return FTPHook(ftp_conn_id)
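A short sketch showing how ftp_connect pairs with the ftp_disconnect helper defined earlier; the connection id and file paths are assumptions for illustration only.

# Illustrative only: 'ftp_default' and the paths below are assumed, not taken from the source.
hook = ftp_connect('ftp_default')
try:
    hook.retrieve_file('/incoming/data.csv', '/tmp/data.csv')  # download one file
finally:
    ftp_disconnect(hook)  # always close the connection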
def _create_hook(self): """ Create a ftp hook based on the ftp connection id. Returns ------- hook : FTPHook The created ftp hook. """ return FTPHook(self.ftp_conn_id)
def GetFiles(**kwargs): """ this function downloads the files from the source host and writes it into the DB """ ftp = FTPHook(ftp_conn_id=af_conn_id) #create a list from all files on the destination what ends with .csv files = [x for x in ftp.list_directory(source) if str(x).endswith('.csv')] #ftp.close_conn() for file in files: data_dict = upload_data(ftp, file) for filename in data_dict: df = pd.read_csv(StringIO(data_dict[filename])) #based on the file names the destination table has to be set if filename.startswith("location"): table = "location_details" #here data modifications could aply #df['UPDATE_DATE'] = pd.to_datetime(df['UPDATE_DATE'], format='%Y%m%d') if filename.startswith("product"): table = "product_details" #db = create_engine(get_postgre_connection(postgre_conn_id)) #db_conn = db.connect() #try: #df.to_sql(name=table, con=db_conn, schema='public', if_exists='append', index=False) #except Exception as error: #print("An exception occurred:", error) #db_conn.close() ssh_hook = SSHHook(af_conn_id) s3_hook = S3Hook(s3_conn_id) sftp_client = ssh_hook.get_conn().open_sftp() with NamedTemporaryFile("w") as f: sftp_client.get(self.sftp_path, f.name) s3_hook.load_file(filename=f.name, key=self.s3_key, bucket_name=self.s3_bucket, replace=True)
def download_files(ds, **kwargs):
    fs = FSHook('fs_bioinf')
    force = kwargs['params'].get('force', 'false') == 'true'
    with FTPHook('ftp_ncbi') as ftp:
        for ftp_name, local_name in ftp_files.items():
            local_path = fs.get_path() + '/' + local_name
            uncompressed_local_path = local_path[:-4]
            if (isfile(local_path) or isfile(uncompressed_local_path)) and not force:
                continue
            if not isfile(local_name):
                ftp.retrieve_file(ftp_directory + ftp_name, local_path)
    open(fs.get_path() + '/done.txt', 'wb')
    return True
def bulk_load_ib(self, table, database, tmp_file, rel_path='/opt', sep='\t',
                 header=0, ftp_conn_id='ftp_default'):
    """
    Loads a tab-delimited file into a database table, using FTPHook to transfer
    the CSV file since ICE doesn't support LOAD DATA LOCAL INFILE.
    Doesn't support IGNORE X LINES, NO HEADER.
    # TODO: DELETE HEADER OPTION
    """
    import uuid
    from airflow.contrib.hooks.ftp_hook import FTPHook

    logging.info('Load file to table : {}'.format(table))
    logging.info('Using Database: {}'.format(database))
    conn_FTP = FTPHook(ftp_conn_id=ftp_conn_id)

    # Remote temp file name and dir
    tmp_file_remote = database + '_' + table + '.csv'
    rnd = str(uuid.uuid4())
    tmp_dir_remote = '/ibl' + rnd + '/'
    conn_FTP.create_directory(tmp_dir_remote)
    logging.info('Temp folder created : {}'.format(tmp_dir_remote))
    temp_filepath = tmp_dir_remote + tmp_file_remote
    logging.info('Transferring file : {}'.format(temp_filepath))
    remote_filepath = rel_path + temp_filepath
    logging.info('Remote file : {}'.format(remote_filepath))

    try:
        conn_FTP.store_file(temp_filepath, tmp_file)
    except Exception as e:
        logging.warning('Failed to store file: %s', e)
        conn_FTP.delete_directory(tmp_dir_remote)
        raise

    # Load the remote temp file uploaded to the Infobright server
    try:
        conn = self.get_conn()
        logging.info('Loading data to Infobright...')
        cur = conn.cursor()
        cur.execute("""
            set @bh_dataformat = 'txt_variable'
        """.format(**locals()))
        cur.execute("""
            set @BH_REJECT_FILE_PATH = '/opt/{rnd}_{database}_{table}_reject.txt'
        """.format(**locals()))
        cur.execute("""
            LOAD DATA INFILE '{remote_filepath}'
            INTO TABLE {database}.{table}
            CHARACTER SET UTF8
            FIELDS TERMINATED BY '{sep}'
        """.format(**locals()))
        conn.commit()
        logging.info('Finished loading data')
    except Exception:
        logging.warning("Failed to execute SQL Statement")
        conn_FTP.delete_file(temp_filepath)
        conn_FTP.delete_directory(tmp_dir_remote)
        raise

    # Remove temp dir after commit
    conn_FTP.delete_file(temp_filepath)
    conn_FTP.delete_directory(tmp_dir_remote)
    logging.info('Removed temp folder')
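A short, hypothetical call of bulk_load_ib for illustration; here `hook` stands for an instance of whichever extended database hook defines the method above, and the table, database, and file names are placeholders.

# Illustrative only: 'hook' is assumed to be an instance of the hook class that defines
# bulk_load_ib above; the table, database and file values are placeholders.
hook.bulk_load_ib(table='sales',
                  database='analytics',
                  tmp_file='/tmp/sales.csv',
                  sep='\t',
                  ftp_conn_id='ftp_default')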
def _create_hook(self): """Return connection hook.""" return FTPHook(ftp_conn_id=self.ftp_conn_id)
def __init__(self, conn_id):
    FTPHook.__init__(self, ftp_conn_id=conn_id)
def download_files(ds, **kwargs):
    with FTPHook('ftp_ncbi') as conn:
        for ftp_name, local_name in ftp_files.items():
            if not isfile(local_name):
                conn.retrieve_file(ftp_directory + ftp_name, local_name)
    # 'on_retry_callback': another_function,
    # 'sla_miss_callback': yet_another_function,
    # 'trigger_rule': 'all_success'
}

dag = DAG(
    dag_id="z3",
    schedule_interval='*/100000 * * * *',
    default_args=default_args
)

REMOTE_PATH = "/data/"
CONN_ID = 'sftp_synchronization'
S3_CONN_ID = "sor_s3"
LOCAL_TMP_DIR = f"/tmp/ftp_files_{datetime.now().microsecond}"
ftp_hook = FTPHook(ftp_conn_id=CONN_ID)
S3_BUCKET = "sap-dqmsdk"

try:
    os.mkdir(LOCAL_TMP_DIR)
except IOError:
    logging.error(f"Can't create {LOCAL_TMP_DIR}")


class FtpDownloadOperator(BaseOperator):

    @apply_defaults
    def __init__(
            self,
            name: str,
            *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.name = name
class FTPSearchOperator(BaseOperator):
    """
    FTPSearchOperator for transferring files from a remote host to local storage,
    with attribute-based search. This operator uses ftp_hook for the file transfer.

    :type ftp_hook: :class:`FTPHook`
    :param ftp_conn_id: connection id from airflow Connections.
    :type ftp_conn_id: str
    :param ftp_conn_type: connection type. ftp, ftps, sftp
    :type ftp_conn_type: str
    :param local_filepath: local file path to write files. (templated)
    :type local_filepath: str
    :param remote_filepath: remote file path to search. (templated)
    :type remote_filepath: str
    :param search_expr: regex filename search (templated)
    :type search_expr: str
    :param min_date: minimum last modified date (templated)
    :param max_date: maximum last modified date (templated)
    """
    template_fields = ('local_filepath', 'remote_filepath', 'search_expr',
                       'ftp_conn_id', 'ftp_conn_type', 'min_date', 'max_date')
    ui_color = '#bbd2f7'

    @apply_defaults
    def __init__(self,
                 ftp_conn_id=None,
                 ftp_conn_type='ftp',
                 local_filepath="/tmp/",
                 remote_filepath="/",
                 search_expr='*',
                 min_date=None,
                 max_date=None,
                 *args,
                 **kwargs):
        super(FTPSearchOperator, self).__init__(*args, **kwargs)
        self.ftp_hook = None
        self.ftp_conn_id = ftp_conn_id
        self.ftp_conn_type = ftp_conn_type
        self.local_filepath = local_filepath
        self.remote_filepath = remote_filepath
        self.search_expr = search_expr
        self.min_date = min_date
        self.downloaded_files = list()
        self.max_date = max_date

    @staticmethod
    def _get_date_param(param):
        if not param:
            return pendulum.create(1900, 1, 1)
        if isinstance(param, pendulum.datetime):
            return param
        elif isinstance(param, str):
            return pendulum.parse(param)

    def execute(self, context):
        min_date = FTPSearchOperator._get_date_param(self.min_date or '1900-01-01')
        max_date = FTPSearchOperator._get_date_param(self.max_date or '2999-12-31')
        self.log.info("Using ftp connection: %s", self.ftp_conn_id)
        self.log.info("min date: %s ", min_date.to_datetime_string())
        self.log.info("max date: %s ", max_date.to_datetime_string())
        if self.ftp_conn_type == 'sftp':
            self.ftp_hook = SFTPHook(ftp_conn_id=self.ftp_conn_id)
        elif self.ftp_conn_type == 'ftps':
            self.ftp_hook = FTPSHook(ftp_conn_id=self.ftp_conn_id)
        else:
            self.ftp_hook = FTPHook(ftp_conn_id=self.ftp_conn_id)
        self.log.info("ftp connection info: %s ",
                      self.ftp_hook.get_connection(self.ftp_conn_id).port)
        self.log.info("Getting directory listing for %s", self.remote_filepath)
        file_list = self.get_file_list(
            ftp_hook=self.ftp_hook,
            remote_filepath=self.remote_filepath,
            search_expr=self.search_expr,
            min_date=min_date,
            max_date=max_date)
        if file_list is not None:
            self.download_files(self.ftp_hook, file_list)
        else:
            self.log.info("No files found matching filters in %s", self.remote_filepath)
        return self.downloaded_files

    def file_list_filter(self, files, search_expr, min_date, max_date):
        self.log.info("file info: " + json.dumps(files, indent=4))
        return [k for k, v in files.items()
                if v.get('type') == 'file'
                and (search_expr is None or re.search(search_expr, k))
                and (min_date is None
                     or dateutil.parser.parse(v.get('modify', v.get('modified'))) > min_date)
                and (max_date is None
                     or dateutil.parser.parse(v.get('modify', v.get('modified'))) <= max_date)]

    def get_file_list(self, ftp_hook, remote_filepath, search_expr, min_date, max_date):
        file_list = ftp_hook.describe_directory(remote_filepath)
        if not bool(file_list):
            return None
        else:
            return self.file_list_filter(file_list, search_expr, min_date, max_date)

    def download_files(self, ftp_hook, file_list):
        for filename in file_list:
            full_remote_path = self.remote_filepath + '/' + filename
            full_local_path = self.local_filepath + '/' + filename
            file_msg = "from {0} to {1}".format(full_remote_path, full_local_path)
            self.log.info("Starting to transfer %s", file_msg)
            ftp_hook.retrieve_file(full_remote_path, full_local_path)
            self.downloaded_files.append(full_local_path)
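A hedged sketch of wiring FTPSearchOperator into a DAG; the connection id, paths, regex, and date below are placeholder values rather than anything from the original project, and only the parameter names come from the class above.

# Placeholder values throughout; assumes a 'dag' object is already defined.
search_task = FTPSearchOperator(
    task_id='download_recent_csvs',
    ftp_conn_id='ftp_default',        # assumed connection id
    ftp_conn_type='ftp',
    remote_filepath='/reports',
    local_filepath='/tmp/reports',
    search_expr=r'.*\.csv$',
    min_date='{{ ds }}',              # templated field
    dag=dag,
)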