Example #1
class FTPFSHook(FileSystemHook):
    conn_type = 'ftp_filesystem'
    conn_type_long = 'FTP FileSystem'

    def __init__(self, conn_params: Connection):
        from airflow.contrib.hooks.ftp_hook import FTPHook
        self.conn_id = conn_params.conn_id
        self.af_ftp_hook = FTPHook(ftp_conn_id=self.conn_id)
        self.base_path = Path(conn_params.extra_dejson.get('base_path', '/'))

    def list_path(self, path: str, recursive: bool = False) -> List[str]:
        if recursive:
            raise NotImplementedError('Recursive list not implemented for FTP')
        else:
            return self.af_ftp_hook.list_directory(str(self.base_path / path))

    def write_data(self, path: str, data: Union[str, bytes, BytesIO]):
        if isinstance(data, str):
            data = data.encode()
        if isinstance(data, bytes):
            data = BytesIO(data)
        self.af_ftp_hook.store_file(str(self.base_path / path), data)

    def read_data(self, path: str) -> BytesIO:
        result = BytesIO()
        self.af_ftp_hook.retrieve_file(str(self.base_path / path), result)
        return result
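
A minimal usage sketch for the hook above; the connection id 'my_ftp' and the sample paths are assumptions, not part of the original example:

# Minimal sketch, assuming an existing Airflow connection named 'my_ftp'
# whose extras may carry a 'base_path'; the paths below are illustrative only.
from airflow.hooks.base_hook import BaseHook

conn = BaseHook.get_connection('my_ftp')   # hypothetical conn id
hook = FTPFSHook(conn_params=conn)

hook.write_data('reports/today.csv', 'a;b;c\n1;2;3\n')  # str -> bytes -> BytesIO
buf = hook.read_data('reports/today.csv')                # BytesIO with file contents
print(buf.getvalue().decode())
print(hook.list_path('reports'))                         # non-recursive listing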
Example #2
    def execute(self, context):
        """
        Main execution point. Steps that are done
            1) connecting to Postgres DB
            2) queering DB and pull data into pandas dataframe
            3) dump data from dataframe into file
            4) send file on SFTP server.
        :param context:
        :return: none
        """
        logging.info('Preparing dataframe...')
        psql_hook = PostgresExtendedHook().get_hook(self.postgres_conn_id)
        df = psql_hook.get_pandas_df(sql=self.sql)
        logging.info('Writing data into temp file')
        with NamedTemporaryFile() as f:
            if self.file_desc['format'].lower() == 'csv':
                df.to_csv(f.name, sep=';', quotechar='|')
            elif self.file_desc['format'].lower() == 'parquet':
                df.to_parquet(f.name, engine='pyarrow')
            try:
                logging.info('Sending file to FTP')
                logging.info('Using connection: %s', self.sftp_conn_id)
                ftp_hook = FTPHook(ftp_conn_id=self.sftp_conn_id)
                conn = ftp_hook.get_conn()
                f.flush()
                conn.set_pasv(False)
                with open(f.name, 'rb') as payload:
                    conn.storbinary(
                        f'STOR {self.file_desc["name"]}.{self.file_desc["format"]}',
                        payload, 1)

            except Exception as ex:
                raise AirflowException(
                    f'Could not put file on SFTP. Details {ex}')
Example #3
    def execute(self, context):
        min_date = FTPSearchOperator._get_date_param(self.min_date or '1900-01-01')
        max_date = FTPSearchOperator._get_date_param(self.max_date or '2999-12-31')
        self.log.info("Using ftp connection: %s", self.ftp_conn_id)
        self.log.info("min date: %s ", min_date.to_datetime_string())
        self.log.info("max date: %s ", max_date.to_datetime_string())
        if self.ftp_conn_type == 'sftp':
            self.ftp_hook = SFTPHook(ftp_conn_id=self.ftp_conn_id)
        elif self.ftp_conn_type == 'ftps':
            self.ftp_hook = FTPSHook(ftp_conn_id=self.ftp_conn_id)
        else:
            self.ftp_hook = FTPHook(ftp_conn_id=self.ftp_conn_id)

        self.log.info("ftp connection info: %s ", self.ftp_hook.get_connection(self.ftp_conn_id).port)

        self.log.info("Getting directory listing for %s", self.remote_filepath)
        file_list = self.get_file_list(
            ftp_hook=self.ftp_hook,
            remote_filepath=self.remote_filepath,
            search_expr=self.search_expr,
            min_date=min_date,
            max_date=max_date)

        if file_list is not None:
            self.download_files(self.ftp_hook, file_list)
        else:
            self.log.info("No files found matching filters in %s", self.remote_filepath)

        return self.downloaded_files
Example #4
def get_ftp_files(**kwargs):
    conn = FTPHook('accela_ftp')
    listing = conn.describe_directory('/opentext/TEST')

    for table in tables:
        recent = get_recent_file(table, listing)
        conn.retrieve_file(recent['name'], f"/tmp/{table}.csv")
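
get_recent_file is not shown in this example; a plausible sketch, assuming describe_directory returned an MLSD-style {name: facts} mapping with a sortable 'modify' timestamp and that filenames start with the table name:

# Hypothetical helper for the snippet above: pick the most recently modified
# directory entry whose name starts with the table name. Assumes MLSD facts
# carrying a 'modify' timestamp (YYYYMMDDHHMMSS).
def get_recent_file(table, listing):
    candidates = [
        {'name': name, 'modify': facts.get('modify', '')}
        for name, facts in listing.items()
        if name.startswith(table)
    ]
    return max(candidates, key=lambda entry: entry['modify'])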
Example #5
def ftp_disconnect(hook: FTPHook) -> None:
    """Disconnect from FTP.

    Args:
        hook (FTPHook): FTPHook instance.
    """
    try:
        hook.close_conn()
    except Exception:
        pass
Example #6
def GetFiles(**kwargs):
    """
    this function downloads the files from the source host and writes it into the DB
    """
    ftp = FTPHook(ftp_conn_id=af_conn_id)
    #ftp.get_conn()

    # create a list of all files on the destination that end with .csv
    files = [x for x in ftp.list_directory(source) if str(x).endswith('.csv')]

    #ftp.close_conn()
    for file in files:
        data_dict = upload_data(ftp, file)
        for filename in data_dict:
            #df = pd.read_csv(StringIO(data_dict[filename]), names=['','','','',''])
            #based on the file names the destination table has to be set
            if filename.startswith("location"):
                table = "location_details_test"
                column_list = 'load from constans'
                #here data modifications could aply
                #df['UPDATE_DATE'] = pd.to_datetime(df['UPDATE_DATE'], format='%Y%m%d')
            if filename.startswith("product"):
                table = "product_details"
                column_list = 'load from constans'

            df = pd.read_csv(StringIO(data_dict[filename]),
                             skiprows=1,
                             names=column_list)
            df['FILE_NAME'] = filename

            db = create_engine(get_postgre_connection(postgre_conn_id))
            db_conn = db.connect()
            try:
                df.to_sql(name=table,
                          con=db_conn,
                          schema='public',
                          if_exists='append',
                          index=False)
            except Exception as error:
                print("An exception occurred:", error)
            db_conn.close()

        taks_id = "S3_" + table
        sftptoaws = SFTPToS3Operator(task_id=taks_id,
                                     sftp_conn_id=af_conn_id,
                                     sftp_path="/root/airflow_test/" + file,
                                     s3_conn_id=s3_conn_id,
                                     s3_bucket='afksiskot',
                                     s3_key=file + '.' + rundt,
                                     dag=dag)

        sftptoaws.execute(context=kwargs)
Example #7
    def execute(self, context):
        """
        Picks up all files from a source directory and dumps them into a root directory system,
        organized by dagid, taskid and execution_date
        For anonimous username = '******' and password = '******'.
        """
        execution_date = context['execution_date'].strftime(DATE_FORMAT)
        #source_dir = src_hook.get_path()
        #Some info: self.dag.dag_id, self.task_id
        logging.info("{0}: copying file from FTP to Samba share".format(execution_date))
        if not self.src_conn_id:
            raise AirflowException("Cannot operate without src_conn_id.")
        if not self.dst_conn_id:
            raise AirflowException("Cannot operate without dst_conn_id.")
        src_hook = FTPHook(ftp_conn_id=self.src_conn_id)
        dst_hook = SambaHook(samba_conn_id=self.dst_conn_id)
        #samba.push_from_local(self.destination_filepath, tmp_file.name) 
        
        if not self.src_filename and self.dest_filename:
            raise AirflowException("FtpToSambaOperator: dest_filename specified but no src_filename.")
        
        if not self.src_filename:
            src_hook.
Example #8
def ftp_connect(ftp_conn_id: str) -> FTPHook:
    """Connect to FTP.

    Args:
        ftp_conn_id (str): ftp conn_id.
    Returns:
        FTPHook: FTPHook instance.
    """
    return FTPHook(ftp_conn_id)
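
This helper pairs with ftp_disconnect from Example #5; a minimal sketch, with a placeholder connection id and remote path:

# Minimal sketch combining ftp_connect (above) with ftp_disconnect (Example #5);
# 'ftp_default' and the listed path are placeholder assumptions.
hook = ftp_connect('ftp_default')
try:
    print(hook.list_directory('/'))
finally:
    ftp_disconnect(hook)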
Example #9
    def _create_hook(self):
        """
        Create a ftp hook based on the ftp connection id.

        Returns
        -------
        hook : FTPHook
            The created ftp hook.
        """
        return FTPHook(self.ftp_conn_id)
Example #10
def GetFiles(**kwargs):
    """
    this function downloads the files from the source host and writes it into the DB
    """
    ftp = FTPHook(ftp_conn_id=af_conn_id)

    # create a list of all files on the destination that end with .csv
    files = [x for x in ftp.list_directory(source) if str(x).endswith('.csv')]

    #ftp.close_conn()
    for file in files:
        data_dict = upload_data(ftp, file)
        for filename in data_dict:
            df = pd.read_csv(StringIO(data_dict[filename]))
            #based on the file names the destination table has to be set
            if filename.startswith("location"):
                table = "location_details"
                # here data modifications could apply
                #df['UPDATE_DATE'] = pd.to_datetime(df['UPDATE_DATE'], format='%Y%m%d')
            if filename.startswith("product"):
                table = "product_details"

            #db = create_engine(get_postgre_connection(postgre_conn_id))
            #db_conn = db.connect()
            #try:
            #df.to_sql(name=table, con=db_conn, schema='public', if_exists='append', index=False)
            #except Exception as error:
            #print("An exception occurred:", error)
            #db_conn.close()

        ssh_hook = SSHHook(af_conn_id)
        s3_hook = S3Hook(s3_conn_id)

        sftp_client = ssh_hook.get_conn().open_sftp()

        with NamedTemporaryFile("w") as f:
            sftp_client.get("/root/airflow_test/" + file, f.name)

            s3_hook.load_file(filename=f.name,
                              key=file + '.' + rundt,
                              bucket_name='afksiskot',
                              replace=True)
Example #11
def download_files(ds, **kwargs):
    fs = FSHook('fs_bioinf')
    force = kwargs['params'].get('force', 'false') == 'true'
    with FTPHook('ftp_ncbi') as ftp:
        for ftp_name, local_name in ftp_files.items():
            local_path = fs.get_path() + '/' + local_name
            uncompressed_local_path = local_path[:-4]
            if (isfile(local_path) or isfile(uncompressed_local_path)) and not force:
                continue
            if not isfile(local_path):
                ftp.retrieve_file(ftp_directory + ftp_name, local_path)
    open(fs.get_path() + '/done.txt', 'wb').close()  # touch a completion marker
    return True
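
Both this snippet and Example #16 below rely on module-level ftp_files and ftp_directory globals that are not shown; a plausible sketch, with illustrative values only:

# Assumed module-level configuration; the remote directory and the
# remote-name -> local-name mapping are illustrative guesses. The snippet
# strips 4 characters for uncompressed_local_path, so a '.zip'-style
# local name is assumed.
ftp_directory = '/pub/example/'
ftp_files = {
    'example_db.zip': 'example_db.zip',   # remote name -> local file name
}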
Example #12
    def bulk_load_ib(self, table, database, tmp_file, rel_path='/opt', sep='\t', header=0, ftp_conn_id='ftp_default'):
        import uuid
        from airflow.contrib.hooks.ftp_hook import FTPHook

        """
        Loads a tab-delimited file into a database table using FTPHook for transfering
        CSV file since ICE don't support LOAD DATA LOCAL INFILE. Doesn't support IGNORE X LINES, NO HEADER

        # TODO : DELETE HEADER OPTION
        """

        logging.info('Load file to table : {}'.format(table))
        logging.info('Using Database: {}'.format(database))
        conn_FTP = FTPHook(ftp_conn_id=ftp_conn_id)
        # Remote temp file name and dir
        tmp_file_remote = database + '_' + table + '.csv'
        rnd = str(uuid.uuid4())
        tmp_dir_remote = '/ibl' + rnd + '/'
        conn_FTP.create_directory(tmp_dir_remote)
        logging.info('Temp folder created : {}'.format(tmp_dir_remote))
        temp_filepath = tmp_dir_remote + tmp_file_remote
        logging.info('Transfering file : {}'.format(temp_filepath))
        remote_filepath = rel_path + temp_filepath
        logging.info('Remote file : {}'.format(remote_filepath))
        try:
            conn_FTP.store_file(temp_filepath, tmp_file)
        except Exception as e:
            logging.warning("Failed to store file: %s", e)
            conn_FTP.delete_directory(tmp_dir_remote)
            raise
        # Load remote temp file uploaded to the Infobright server
        try:
            conn = self.get_conn()
            logging.info('Loading data to Infobright...')
            cur = conn.cursor()

            cur.execute("""
                set @bh_dataformat = 'txt_variable'
            """.format(**locals()))

            cur.execute("""
                set @BH_REJECT_FILE_PATH = '/opt/{rnd}_{database}_{table}_reject.txt'
            """.format(**locals()))

            cur.execute("""
                LOAD DATA INFILE '{remote_filepath}'
                INTO TABLE {database}.{table}
		        CHARACTER SET UTF8
                FIELDS TERMINATED BY '{sep}'
                """.format(**locals()))
            conn.commit()
            logging.info('Finished loading data')
        except Exception:
            logging.warning("Failed to execute SQL Statement")
            conn_FTP.delete_file(temp_filepath)
            conn_FTP.delete_directory(tmp_dir_remote)
            raise
        # Remove temp dir after commit
        conn_FTP.delete_file(temp_filepath)
        conn_FTP.delete_directory(tmp_dir_remote)
        logging.info('Removed temp folder')
Example #13
    def _create_hook(self):
        """Return connection hook."""
        return FTPHook(ftp_conn_id=self.ftp_conn_id)
Example #14
    def __init__(self, conn_params: Connection):
        from airflow.contrib.hooks.ftp_hook import FTPHook
        self.conn_id = conn_params.conn_id
        self.af_ftp_hook = FTPHook(ftp_conn_id=self.conn_id)
        self.base_path = Path(conn_params.extra_dejson.get('base_path', '/'))
Example #15
    def __init__(self, conn_id):
        FTPHook.__init__(self, ftp_conn_id=conn_id)
Example #16
def download_files(ds, **kwargs):
    with FTPHook('ftp_ncbi') as conn:
        for ftp_name, local_name in ftp_files.items():
            if not isfile(local_name):
                conn.retrieve_file(ftp_directory + ftp_name, local_name)
Example #17
    def __init__(self, conn_id):
        FTPHook.__init__(self, ftp_conn_id=conn_id)
Example #18
    # 'on_retry_callback': another_function,
    # 'sla_miss_callback': yet_another_function,
    # 'trigger_rule': 'all_success'
}

dag = DAG(
    dag_id="z3",
    schedule_interval='*/100000 * * * *',
    default_args=default_args
)

REMOTE_PATH = "/data/"
CONN_ID = 'sftp_synchronization'
S3_CONN_ID = "sor_s3"
LOCAL_TMP_DIR = f"/tmp/ftp_files_{datetime.now().microsecond}"
ftp_hook = FTPHook(ftp_conn_id=CONN_ID)
S3_BUCKET = "sap-dqmsdk"
try:
    os.mkdir(LOCAL_TMP_DIR)
except OSError:
    logging.error(f"Can't create {LOCAL_TMP_DIR}")


class FtpDownloadOperator(BaseOperator):
    @apply_defaults
    def __init__(
            self,
            name: str,
            *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.name = name
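
A sketch of wiring the operator above into the DAG; since execute() is not shown in this example, the task below only illustrates instantiation:

# Illustrative instantiation only; behavior beyond __init__ is not shown
# in this example, and the 'name' value is a placeholder.
ftp_download = FtpDownloadOperator(
    task_id='ftp_download',
    name='daily_sync',
    dag=dag,
)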
Example #19
    def bulk_load_ib(self,
                     table,
                     database,
                     tmp_file,
                     rel_path='/opt',
                     sep='\t',
                     header=0,
                     ftp_conn_id='ftp_default'):
        import uuid
        from airflow.contrib.hooks.ftp_hook import FTPHook
        """
        Loads a tab-delimited file into a database table using FTPHook for transfering
        CSV file since ICE don't support LOAD DATA LOCAL INFILE. Doesn't support IGNORE X LINES, NO HEADER

        # TODO : DELETE HEADER OPTION
        """

        logging.info('Load file to table : {}'.format(table))
        logging.info('Using Database: {}'.format(database))
        conn_FTP = FTPHook(ftp_conn_id=ftp_conn_id)
        # Remote temp file name and dir
        tmp_file_remote = database + '_' + table + '.csv'
        rnd = str(uuid.uuid4())
        tmp_dir_remote = '/ibl' + rnd + '/'
        conn_FTP.create_directory(tmp_dir_remote)
        logging.info('Temp folder created : {}'.format(tmp_dir_remote))
        temp_filepath = tmp_dir_remote + tmp_file_remote
        logging.info('Transfering file : {}'.format(temp_filepath))
        remote_filepath = rel_path + temp_filepath
        logging.info('Remote file : {}'.format(remote_filepath))
        try:
            conn_FTP.store_file(temp_filepath, tmp_file)
        except Exception as e:
            logging.warning("Failed to store file: %s", e)
            conn_FTP.delete_directory(tmp_dir_remote)
            raise
        # Load Remote temp file uploaded to Infobright Server
        try:
            conn = self.get_conn()
            logging.info('Loading data to Infobright...')
            cur = conn.cursor()

            cur.execute("""
                set @bh_dataformat = 'txt_variable'
            """.format(**locals()))

            cur.execute("""
                set @BH_REJECT_FILE_PATH = '/opt/{rnd}_{database}_{table}_reject.txt'
            """.format(**locals()))

            cur.execute("""
                LOAD DATA INFILE '{remote_filepath}'
                INTO TABLE {database}.{table}
		        CHARACTER SET UTF8
                FIELDS TERMINATED BY '{sep}'
                """.format(**locals()))
            conn.commit()
            logging.info('Finished loading data')
        except Exception:
            logging.warning("Failed to execute SQL Statement")
            conn_FTP.delete_file(temp_filepath)
            conn_FTP.delete_directory(tmp_dir_remote)
            raise
        # Remove temp dir after commit
        conn_FTP.delete_file(temp_filepath)
        conn_FTP.delete_directory(tmp_dir_remote)
        logging.info('Removed temp folder')
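
A minimal call sketch for bulk_load_ib; the hook class and connection ids are assumptions (self.get_conn() implies a MySQL-style hook subclass), and the local file must already exist:

# Minimal sketch, assuming bulk_load_ib is defined on a hypothetical
# MySqlHook subclass (InfobrightHook) and that /tmp/out.tsv already exists.
hook = InfobrightHook(mysql_conn_id='ib_default')
hook.bulk_load_ib(
    table='events',
    database='analytics',
    tmp_file='/tmp/out.tsv',
    ftp_conn_id='ftp_default',
)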
Example #20
class FTPSearchOperator(BaseOperator):
    """
    FTPOperator for transferring files from remote host to local with attribute search.
    This operator uses ftp_hook for file transfer.
    :type ftp_hook: :class:`SSHHook`
    :param ftp_conn_id: connection id from airflow Connections.
    :type ftp_conn_id: str
    :param local_filepath: local file path to write files. (templated)
    :type local_filepath: str
    :param remote_filepath: remote file path to search. (templated)
    :type remote_filepath: str
    :param search_expr: regex filename search (templated)
    :type search_expr: str
    :param min_date: minimum last modified date (templated)
    :param ftp_conn_type: connection type.  ftp, ftps, sftp
    :type ftp_conn_type: str
    :param max_date: max last modified date (templated)
    """
    template_fields = ('local_filepath', 'remote_filepath', 'search_expr', 'ftp_conn_id', 'ftp_conn_type', 'min_date', 'max_date')
    ui_color = '#bbd2f7'

    @apply_defaults
    def __init__(self,
                 ftp_conn_id=None,
                 ftp_conn_type='ftp',
                 local_filepath="/tmp/",
                 remote_filepath="/",
                 search_expr='*',
                 min_date=None,
                 max_date=None,
                 *args,
                 **kwargs):
        super(FTPSearchOperator, self).__init__(*args, **kwargs)
        self.ftp_hook = None
        self.ftp_conn_id = ftp_conn_id
        self.ftp_conn_type = ftp_conn_type
        self.local_filepath = local_filepath
        self.remote_filepath = remote_filepath
        self.search_expr = search_expr
        self.min_date = min_date
        self.downloaded_files = list()
        self.max_date = max_date

    @staticmethod
    def _get_date_param(param):
        if not param:
            return pendulum.create(1900, 1, 1)
        if isinstance(param, pendulum.Pendulum):  # pendulum 1.x datetime class, matching pendulum.create() above
            return param
        elif isinstance(param, str):
            return pendulum.parse(param)

    def execute(self, context):
        min_date = FTPSearchOperator._get_date_param(self.min_date or '1900-01-01')
        max_date = FTPSearchOperator._get_date_param(self.max_date or '2999-12-31')
        self.log.info("Using ftp connection: %s", self.ftp_conn_id)
        self.log.info("min date: %s ", min_date.to_datetime_string())
        self.log.info("max date: %s ", max_date.to_datetime_string())
        if self.ftp_conn_type == 'sftp':
            self.ftp_hook = SFTPHook(ftp_conn_id=self.ftp_conn_id)
        elif self.ftp_conn_type == 'ftps':
            self.ftp_hook = FTPSHook(ftp_conn_id=self.ftp_conn_id)
        else:
            self.ftp_hook = FTPHook(ftp_conn_id=self.ftp_conn_id)

        self.log.info("ftp connection info: %s ", self.ftp_hook.get_connection(self.ftp_conn_id).port)

        self.log.info("Getting directory listing for %s", self.remote_filepath)
        file_list = self.get_file_list(
            ftp_hook=self.ftp_hook,
            remote_filepath=self.remote_filepath,
            search_expr=self.search_expr,
            min_date=min_date,
            max_date=max_date)

        if file_list is not None:
            self.download_files(self.ftp_hook, file_list)
        else:
            self.log.info("No files found matching filters in %s", self.remote_filepath)

        return self.downloaded_files

    def file_list_filter(self, files, search_expr, min_date, max_date):
        self.log.info("file info: " + json.dumps(files, indent=4))
        return [k for k, v in files.items() if v.get('type') == 'file'
                and (search_expr is None or re.search(search_expr, k))
                and (min_date is None or dateutil.parser.parse(v.get('modify', v.get('modified'))) > min_date)
                and (max_date is None or dateutil.parser.parse(v.get('modify', v.get('modified'))) <= max_date)]

    def get_file_list(self, ftp_hook, remote_filepath, search_expr, min_date, max_date):
        file_list = ftp_hook.describe_directory(remote_filepath)
        if not file_list:
            return None
        return self.file_list_filter(file_list, search_expr, min_date, max_date)

    def download_files(self, ftp_hook, file_list):
        for filename in file_list:
            full_remote_path = self.remote_filepath + '/' + filename
            full_local_path = self.local_filepath + '/' + filename
            file_msg = "from {0} to {1}".format(full_remote_path, full_local_path)
            self.log.info("Starting to transfer %s", file_msg)
            ftp_hook.retrieve_file(full_remote_path, full_local_path)
            self.downloaded_files.append(full_local_path)
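
A usage sketch for FTPSearchOperator; connection ids, paths and dates are placeholders:

# Illustrative task definition; all ids, paths, and dates below are
# placeholders, and 'dag' is assumed to be defined elsewhere.
search_task = FTPSearchOperator(
    task_id='fetch_recent_reports',
    ftp_conn_id='ftp_default',
    ftp_conn_type='ftp',
    remote_filepath='/reports',
    local_filepath='/tmp/reports',
    search_expr=r'\.csv$',
    min_date='2020-01-01',
    dag=dag,
)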