Example 1
from io import BytesIO
from pathlib import Path
from typing import List, Union

from airflow.models import Connection


class FTPFSHook(FileSystemHook):
    conn_type = 'ftp_filesystem'
    conn_type_long = 'FTP FileSystem'

    def __init__(self, conn_params: Connection):
        from airflow.contrib.hooks.ftp_hook import FTPHook
        self.conn_id = conn_params.conn_id
        self.af_ftp_hook = FTPHook(ftp_conn_id=self.conn_id)
        # extra_dejson is a dict property, not a callable, so use .get()
        self.base_path = Path(conn_params.extra_dejson.get('base_path', '/'))

    def list_path(self, path: str, recursive: bool = False) -> List[str]:
        if recursive:
            raise NotImplementedError('Recursive list not implemented for FTP')
        else:
            return self.af_ftp_hook.list_directory(str(self.base_path / path))

    def write_data(self, path: str, data: Union[str, bytes, BytesIO]):
        if isinstance(data, str):
            data = data.encode()
        if isinstance(data, bytes):
            data = BytesIO(data)
        self.af_ftp_hook.store_file(str(self.base_path / path), data)

    def read_data(self, path: str) -> BytesIO:
        result = BytesIO()
        self.af_ftp_hook.retrieve_file(str(self.base_path / path), result)
        return result
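
A minimal usage sketch for the hook above, assuming an Airflow Connection whose extra JSON carries a base_path; the conn_id, extra payload, and paths here are hypothetical, and in practice the Connection would be looked up from Airflow's metadata DB by conn_id rather than built inline.

from airflow.models import Connection

# hypothetical Connection; normally fetched from the metadata DB by conn_id
conn = Connection(conn_id='ftp_data',
                  conn_type='ftp_filesystem',
                  extra='{"base_path": "/exports"}')

hook = FTPFSHook(conn)
hook.write_data('reports/daily.csv', 'id,value\n1,42\n')
print(hook.list_path('reports'))          # entries under /exports/reports
payload = hook.read_data('reports/daily.csv').getvalue()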
Example 2
from io import StringIO

import pandas as pd
from sqlalchemy import create_engine

from airflow.contrib.hooks.ftp_hook import FTPHook
from airflow.contrib.operators.sftp_to_s3_operator import SFTPToS3Operator


def GetFiles(**kwargs):
    """
    Download the .csv files from the source host and write their contents into the DB.
    """
    ftp = FTPHook(ftp_conn_id=af_conn_id)
    #ftp.get_conn()

    # collect every file on the source that ends with .csv
    files = [x for x in ftp.list_directory(source) if str(x).endswith('.csv')]

    #ftp.close_conn()
    for file in files:
        data_dict = upload_data(ftp, file)
        for filename in data_dict:
            #df = pd.read_csv(StringIO(data_dict[filename]), names=['','','','',''])
            # the destination table is chosen based on the file name
            if filename.startswith("location"):
                table = "location_details_test"
                column_list = 'load from constants'
                # data modifications could be applied here, e.g.:
                #df['UPDATE_DATE'] = pd.to_datetime(df['UPDATE_DATE'], format='%Y%m%d')
            elif filename.startswith("product"):
                table = "product_details"
                column_list = 'load from constants'

            df = pd.read_csv(StringIO(data_dict[filename]),
                             skiprows=1,
                             names=column_list)
            df['FILE_NAME'] = filename

            db = create_engine(get_postgre_connection(postgre_conn_id))
            db_conn = db.connect()
            try:
                df.to_sql(name=table,
                          con=db_conn,
                          schema='public',
                          if_exists='append',
                          index=False)
            except Exception as error:
                print("An exception occurred:", error)
            db_conn.close()

        task_id = "S3_" + table
        sftptoaws = SFTPToS3Operator(task_id=task_id,
                                     sftp_conn_id=af_conn_id,
                                     sftp_path="/root/airflow_test/" + file,
                                     s3_conn_id=s3_conn_id,
                                     s3_bucket='afksiskot',
                                     s3_key=file + '.' + rundt,
                                     dag=dag)

        sftptoaws.execute(context=kwargs)
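
The upload_data helper is referenced above but never shown; a minimal sketch of what it might look like, assuming it fetches one remote file via the FTPHook and returns a dict mapping the file name to its CSV text (the shape is inferred from how GetFiles consumes it):

from io import BytesIO

def upload_data(ftp, file):
    # hypothetical reconstruction: download the remote file into memory
    # and return {filename: csv_text}, matching the loop in GetFiles
    buf = BytesIO()
    ftp.retrieve_file(source + '/' + file, buf)
    return {file: buf.getvalue().decode('utf-8')}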
Example 3
from io import StringIO
from tempfile import NamedTemporaryFile

import pandas as pd

from airflow.contrib.hooks.ftp_hook import FTPHook
from airflow.contrib.hooks.ssh_hook import SSHHook
from airflow.hooks.S3_hook import S3Hook


def GetFiles(**kwargs):
    """
    Download the .csv files from the source host and write their contents into the DB.
    """
    ftp = FTPHook(ftp_conn_id=af_conn_id)

    # collect every file on the source that ends with .csv
    files = [x for x in ftp.list_directory(source) if str(x).endswith('.csv')]

    #ftp.close_conn()
    for file in files:
        data_dict = upload_data(ftp, file)
        for filename in data_dict:
            df = pd.read_csv(StringIO(data_dict[filename]))
            # the destination table is chosen based on the file name
            if filename.startswith("location"):
                table = "location_details"
                # data modifications could be applied here, e.g.:
                #df['UPDATE_DATE'] = pd.to_datetime(df['UPDATE_DATE'], format='%Y%m%d')
            elif filename.startswith("product"):
                table = "product_details"

            #db = create_engine(get_postgre_connection(postgre_conn_id))
            #db_conn = db.connect()
            #try:
            #df.to_sql(name=table, con=db_conn, schema='public', if_exists='append', index=False)
            #except Exception as error:
            #print("An exception occurred:", error)
            #db_conn.close()

        ssh_hook = SSHHook(ssh_conn_id=af_conn_id)
        s3_hook = S3Hook(aws_conn_id=s3_conn_id)

        sftp_client = ssh_hook.get_conn().open_sftp()

        # inside a plain function there is no `self`; build the transfer
        # parameters from the loop variables instead (mirroring Example 2)
        sftp_path = "/root/airflow_test/" + file
        s3_key = file + '.' + rundt
        s3_bucket = 'afksiskot'

        with NamedTemporaryFile("w") as f:
            sftp_client.get(sftp_path, f.name)

            s3_hook.load_file(filename=f.name,
                              key=s3_key,
                              bucket_name=s3_bucket,
                              replace=True)
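
Both variants of GetFiles are written to be driven by a PythonOperator; a minimal wiring sketch, assuming Airflow 1.10 (matching the contrib imports above) and a hypothetical DAG id and schedule. The dag is kept as a module-level global because Example 2 references dag=dag inside GetFiles.

from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

dag = DAG(dag_id='ftp_to_db_and_s3',
          start_date=datetime(2021, 1, 1),
          schedule_interval='@daily')

get_files = PythonOperator(task_id='get_files',
                           python_callable=GetFiles,
                           provide_context=True,  # pass the context as **kwargs (Airflow 1.x)
                           dag=dag)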