Beispiel #1
0
    def execute(self, context):
        """
        Copy files with a matching extension from the SFTP directory to S3.

        Lists ``self.sftp_path``, keeps the entries whose lower-cased name
        ends with ``self.file_extensions``, downloads each one to a temporary
        file and uploads it to ``self.s3_bucket``. An empty ``_SUCCESS``
        object is written last to mark the run as complete.

        :param context: Airflow task context (not used here)
        """
        sftp_hook = SFTPHook(ftp_conn_id=self.sftp_conn_id)
        s3_hook = S3Hook(self.s3_conn_id)

        def has_wanted_extension(name):
            return name.lower().endswith(self.file_extensions)

        matching_names = list(
            filter(has_wanted_extension,
                   sftp_hook.list_directory(self.sftp_path)))

        for name in matching_names:
            with NamedTemporaryFile("w") as tmp:
                local_name = tmp.name
                sftp_hook.retrieve_file(f'{self.sftp_path}/{name}',
                                        local_name)

                s3_hook.load_file(
                    filename=local_name,
                    key=self.get_s3_key(f'{self.s3_prefix}/{name}'),
                    bucket_name=self.s3_bucket,
                    replace=True,
                )

        # The empty _SUCCESS marker tells consumers the transfer finished.
        marker_key = self.get_s3_key(f'{self.s3_prefix}/_SUCCESS')
        s3_hook.load_string('',
                            key=marker_key,
                            bucket_name=self.s3_bucket,
                            replace=True)
Beispiel #2
0
class SFTPSensor(BaseSensorOperator):
    """
    Sensor that succeeds once a remote path is present on an SFTP server.

    :param path: Remote file or directory path
    :type path: str
    :param sftp_conn_id: The connection to run the sensor against
    :type sftp_conn_id: str
    """
    template_fields = ('path', )

    @apply_defaults
    def __init__(self, path, sftp_conn_id='sftp_default', *args, **kwargs):
        """Store the path to watch and build the SFTP hook."""
        super(SFTPSensor, self).__init__(*args, **kwargs)
        self.path = path
        self.hook = SFTPHook(sftp_conn_id=sftp_conn_id)

    def poke(self, context):
        """
        Probe the remote path by asking for its modification time.

        :return: True when the lookup succeeds, False when it fails with
            ``SFTP_NO_SUCH_FILE``; any other IOError is re-raised.
        """
        logging.info('Poking for %s', self.path)
        try:
            self.hook.get_mod_time(self.path)
        except IOError as err:
            if err.errno == SFTP_NO_SUCH_FILE:
                return False
            raise err
        else:
            # The connection is closed only after a successful lookup.
            self.hook.close_conn()
            return True
Beispiel #3
0
 def setUp(self):
     """Load the test config, swap the connection login and create fixtures."""
     configuration.load_test_config()
     self.old_login = self.update_connection(SFTP_CONNECTION_USER)
     self.hook = SFTPHook()

     tests_dir = os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS)
     os.makedirs(tests_dir)

     test_file_path = os.path.join(TMP_PATH, TMP_FILE_FOR_TESTS)
     with open(test_file_path, 'a') as handle:
         handle.write('Test file')
Beispiel #4
0
class SFTPSensor(BaseSensorOperator):
    """
    Sensor that waits for files matching a regex to stop changing on SFTP.

    :param filepath: Remote directory to inspect
    :param filepattern: Regular expression matched against each file name
    :param sftp_conn_id: The connection to run the sensor against
    """
    @apply_defaults
    def __init__(self,
                 filepath,
                 filepattern,
                 sftp_conn_id='sftp_default',
                 *args,
                 **kwargs):
        super(SFTPSensor, self).__init__(*args, **kwargs)
        self.filepath = filepath
        self.filepattern = filepattern
        self.sftp_conn_id = sftp_conn_id
        self.hook = SFTPHook(ftp_conn_id=sftp_conn_id, keepalive_interval=10)

    def poke(self, context):
        """
        Look for matching files whose size and modify time are stable.

        The directory is described twice, 30 seconds apart. Files that match
        the pattern and report the same ``size`` and ``modify`` values in
        both listings are pushed to XCom under ``file_name``.

        :return: True when at least one stable file was found, otherwise
            False (also when the directory lookup fails with
            ``SFTP_NO_SUCH_FILE``).
        """
        full_path = self.filepath
        file_pattern = re.compile(self.filepattern)
        fileList = []

        try:
            isFound = False
            directory = self.hook.describe_directory(full_path)
            logging.info('Polling Interval 1')

            # Build a filtered copy instead of deleting from the dict while
            # iterating it, which raises RuntimeError on Python 3.
            candidates = {}
            for file, attributes in directory.items():
                if file_pattern.match(file):
                    candidates[file] = attributes
                else:
                    self.log.info(file)
                    self.log.info(file_pattern)

            if not candidates:
                # If directory has no files that match the mask, exit
                return isFound

            # wait before we compare file sizes and timestamps again to
            # verify that the file is done transferring to remote loc
            time.sleep(30)

            logging.info('Post-Wait Polling')
            newDirectoryResults = self.hook.describe_directory(full_path)

            for file, current in newDirectoryResults.items():
                if file not in candidates:
                    continue
                previous = candidates[file]
                if current['size'] == previous['size'] and \
                        current['modify'] == previous['modify']:
                    fileList.append(file)
                    print(
                        'filename: {} with size {} and modified time of {} met all criteria to be moved.'
                        .format(file, current['size'], current['modify']))
                    isFound = True

            context["task_instance"].xcom_push("file_name", fileList)

            return isFound
        except IOError as e:
            if e.errno != SFTP_NO_SUCH_FILE:
                raise e
            return False
Beispiel #5
0
    def execute(self, context):
        """
        Relay files from the source SFTP directory to every configured target.

        Each entry of ``self.target_full_path`` is read as
        ``(connection id, directory)``. A file is transferred when it passes
        ``self.filter_function`` (if one is set) and either
        ``self.overwrite_target`` is True or the target does not list it yet.
        The local working copy is removed after every upload attempt.

        :param context: Airflow task context (not used here)
        """
        source_hook = SFTPHook(ftp_conn_id=self.source_conn_id)
        source_files = source_hook.list_directory(self.source_path)

        for target in self.target_full_path:
            target_connection, target_path = target[0], target[1]
            self.log.info(
                f"Beginning transfer to SFTP site {target_connection} and directory {target_path}"
            )
            target_hook = SFTPHook(ftp_conn_id=target_connection)
            target_files = target_hook.list_directory(target_path)

            for file_name in source_files:
                if (self.filter_function is not None
                        and not self.filter_function(file_name)):
                    continue
                if (self.overwrite_target is not True
                        and file_name in target_files):
                    continue

                local_copy = op.join(self.work_path, file_name)
                source_hook.retrieve_file(
                    op.join(self.source_path, file_name), local_copy)
                self.log.info(
                    "Downloaded the file %s from the source SFTP",
                    file_name)
                try:
                    target_hook.store_file(op.join(target_path, file_name),
                                           local_copy)
                    self.log.info(
                        "Uploaded the file %s to the destination SFTP",
                        file_name)
                finally:
                    # Drop the working copy whether or not the upload worked.
                    os.remove(os.path.join(self.work_path, file_name))
    def execute(self, context):
        """
        Copy S3 objects with a matching extension to the SFTP directory.

        Keys under ``self.s3_prefix`` whose lower-cased name ends with
        ``self.file_extensions`` are downloaded one by one and uploaded as
        ``<sftp_filename_prefix>-part-<index><extension>`` into
        ``self.sftp_path``, with the index counting from 0.

        :param context: Airflow task context (not used here)
        """
        sftp_hook = SFTPHook(ftp_conn_id=self.sftp_conn_id)
        s3_hook = S3Hook(self.s3_conn_id)

        s3_client = s3_hook.get_conn()
        sftp_client = sftp_hook.get_conn()

        matching_keys = []
        for key in s3_hook.list_keys(self.s3_bucket, prefix=self.s3_prefix):
            if key.lower().endswith(self.file_extensions):
                matching_keys.append(key)

        for part_number, key in enumerate(matching_keys):
            with NamedTemporaryFile("w") as tmp:
                s3_client.download_file(self.s3_bucket, key, tmp.name)

                extension = os.path.splitext(key)[1]
                remote_path = os.path.join(
                    self.sftp_path,
                    f'{self.sftp_filename_prefix}-part-{part_number}{extension}',
                )

                sftp_client.put(tmp.name, remote_path)
class SFTPSensor(BaseSensorOperator):
    """
    Sensor that succeeds once a remote path is present on an SFTP server.

    :param path: Remote file or directory path
    :type path: str
    :param sftp_conn_id: The connection to run the sensor against
    :type sftp_conn_id: str
    """
    template_fields = ('path',)

    @apply_defaults
    def __init__(self, path, sftp_conn_id='sftp_default', *args, **kwargs):
        """Store the path to watch and build the SFTP hook."""
        super().__init__(*args, **kwargs)
        self.path = path
        self.hook = SFTPHook(sftp_conn_id)

    def poke(self, context):
        """
        Probe the remote path by asking for its modification time.

        :return: True when the lookup succeeds, False when it fails with
            ``SFTP_NO_SUCH_FILE``; any other IOError is re-raised.
        """
        self.log.info('Poking for %s', self.path)
        try:
            self.hook.get_mod_time(self.path)
        except IOError as err:
            if err.errno == SFTP_NO_SUCH_FILE:
                return False
            raise err
        else:
            # The connection is closed only after a successful lookup.
            self.hook.close_conn()
            return True
Beispiel #8
0
def check_file(**context):
    """Return one of two caller-supplied values depending on whether a remote file exists.

    Keys read from ``context``:
        templates_dict['file_path'] -- full path of the file to look for
        conn_id -- connection id handed to SFTPHook
        id_true -- value returned when the file exists
        id_false -- value returned when the file does not exist

    Returns:
        [any] -- the ``id_true`` value when ``path_exists`` is truthy,
        otherwise the ``id_false`` value
    """
    f_path = context['templates_dict']['file_path']
    conn = context['conn_id']
    if_true = context['id_true']
    if_false = context['id_false']

    file_is_present = SFTPHook(conn).path_exists(f_path)
    return if_true if file_is_present else if_false
Beispiel #9
0
class SFTPSensor(BaseSensorOperator):
    """
    Sensor that monitors an SFTP location for files matching a glob pattern.

    :param filepath: Remote directory to inspect
    :param filepattern: ``fnmatch``-style pattern matched against file names
    :param sftp_conn_id: The connection to run the sensor against
    """
    @apply_defaults
    def __init__(self, filepath, filepattern, sftp_conn_id='sftp_default', *args, **kwargs):
        super(SFTPSensor, self).__init__(*args, **kwargs)
        self.filepath = filepath
        self.filepattern = filepattern
        self.hook = SFTPHook(sftp_conn_id)

    def poke(self, context):
        """
        Find the oldest file matching the pattern and publish its name.

        The name of the matching file with the smallest modification time is
        pushed to XCom under ``file_name``.

        :return: True when at least one file matched, False otherwise so the
            sensor keeps waiting instead of succeeding with an empty name.
        """
        full_path = self.filepath
        pattern = self.filepattern

        # (modification time, name) pairs; a list keeps files that share the
        # same modification time, which a dict keyed by time would drop.
        candidates = []
        for file in self.hook.list_directory(full_path):
            if not fnmatch.fnmatch(file, pattern):
                self.log.info(file)
                self.log.info(pattern)
                continue
            self.log.info("File found {}".format(file))
            mod_time = int(self.hook.get_mod_time(full_path + "/" + file))
            candidates.append((mod_time, file))

        print("files found with modified time : {0}".format(candidates))
        if not candidates:
            return False

        oldest_file = min(candidates)[1]
        context["task_instance"].xcom_push("file_name", oldest_file)
        self.log.info("xcom_pushed : {}".format(oldest_file))
        return True
Beispiel #10
0
def moveFromSourceToLocal(**kwargs):
    """
    Download the ready file named in the dag_run config and verify it by MD5.

    The remote MD5 sum is taken before the transfer and compared with the
    MD5 sum of the local copy afterwards; a mismatch raises an Exception.
    """
    run_conf = kwargs['dag_run'].conf
    sftpConn = run_conf['SFTP_Connection_Name']
    sourceFullPath = run_conf['File_Name']

    # The local name is the remote basename without the ".ready" marker.
    fileName = os.path.basename(run_conf['File_Name']).replace('.ready', '')
    destFullPath = os.path.join(LOCAL_LANDING_PATH, fileName)

    sftpHook = SFTPHook(ftp_conn_id=sftpConn)
    conn = sftpHook.get_conn()

    initialMD5sum = getMD5sumRemote(conn, sourceFullPath)
    logging.info('Initial MD5Sum: {}'.format(initialMD5sum))

    sftpHook.retrieve_file(sourceFullPath, destFullPath)

    currentMD5sum = getMD5sumLocal(destFullPath)
    logging.info('currentMD5Sum: {}'.format(currentMD5sum))

    if initialMD5sum == currentMD5sum:
        return

    logging.error('MD5Sum mismatch.  Initial: {}  Post-Transfer: {}'.format(
        initialMD5sum, currentMD5sum))
    raise Exception(
        'MD5Sum values before and after transfer do not match. Possible transfer issue. Initial: {} Post-Transfer: {}'
        .format(initialMD5sum, currentMD5sum))
Beispiel #11
0
    def _copy_single_object(
        self,
        gcs_hook: GoogleCloudStorageHook,
        sftp_hook: SFTPHook,
        source_object: str,
        destination_path: str,
    ) -> None:
        """
        Copy one object from the source bucket to a path on the SFTP server.

        The parent directory of ``destination_path`` is created first. When
        ``self.move_object`` is set, the source object is deleted afterwards.
        """
        self.log.info(
            "Executing copy of gs://%s/%s to %s",
            self.source_bucket,
            source_object,
            destination_path,
        )

        sftp_hook.create_directory(os.path.dirname(destination_path))

        with NamedTemporaryFile("w") as tmp:
            local_name = tmp.name
            gcs_hook.download(
                bucket_name=self.source_bucket,
                object_name=source_object,
                filename=local_name,
            )
            sftp_hook.store_file(destination_path, local_name)

        if not self.move_object:
            return

        self.log.info("Executing delete of gs://%s/%s", self.source_bucket,
                      source_object)
        gcs_hook.delete(self.source_bucket, source_object)
 def execute(self, context):
     """Delete ``self.file_path`` on the SFTP server and return True."""
     self.log.info("Going to start delete file sftp operator")

     hook = SFTPHook(ftp_conn_id=self.sftp_conn_id)
     # The flag is set on the hook before any remote call is made.
     hook.no_host_key_check = True
     hook.delete_file(self.file_path)

     self.log.info("Finished executing delete file sftp operator")
     return True
    def _copy_single_object(
        self,
        gcs_hook: GoogleCloudStorageHook,
        sftp_hook: SFTPHook,
        source_path: str,
        destination_object: str,
    ) -> None:
        """
        Copy one file from the SFTP server into the destination bucket.

        When ``self.move_object`` is set, the source file is deleted from
        the SFTP server after the upload.
        """
        self.log.info(
            "Executing copy of %s to gs://%s/%s",
            source_path,
            self.destination_bucket,
            destination_object,
        )

        with NamedTemporaryFile("w") as tmp:
            local_name = tmp.name
            sftp_hook.retrieve_file(source_path, local_name)

            gcs_hook.upload(
                bucket_name=self.destination_bucket,
                object_name=destination_object,
                filename=local_name,
                mime_type=self.mime_type,
            )

        if not self.move_object:
            return

        self.log.info("Executing delete of %s", source_path)
        sftp_hook.delete_file(source_path)
    def execute(self, context):
        """
        Copy one file, or every file matching a single wildcard, from SFTP to GCS.

        With a wildcard in ``self.source_path`` the text before it is used as
        the prefix and the text after it as the delimiter for the SFTP tree
        listing. Without a wildcard, the object name is
        ``self.destination_path`` when set, otherwise the last "/"-separated
        component of ``self.source_path``.

        :param context: Airflow task context (not used here)
        :raises AirflowException: when ``source_path`` has more than one wildcard
        """
        gcs_hook = GoogleCloudStorageHook(gcp_conn_id=self.gcp_conn_id,
                                          delegate_to=self.delegate_to)

        sftp_hook = SFTPHook(self.sftp_conn_id)

        if WILDCARD in self.source_path:
            total_wildcards = self.source_path.count(WILDCARD)
            if total_wildcards > 1:
                raise AirflowException(
                    "Only one wildcard '*' is allowed in source_path parameter. "
                    "Found {} in {}.".format(total_wildcards,
                                             self.source_path))

            prefix, delimiter = self.source_path.split(WILDCARD, 1)
            base_path = os.path.dirname(prefix)

            files, _, _ = sftp_hook.get_tree_map(base_path,
                                                 prefix=prefix,
                                                 delimiter=delimiter)

            for file in files:
                destination_path = file.replace(base_path,
                                                self.destination_path, 1)
                self._copy_single_object(gcs_hook, sftp_hook, file,
                                         destination_path)

        else:
            # [-1] instead of [1]: a source_path without any "/" yields a
            # one-element list, which used to raise IndexError.
            destination_object = (self.destination_path
                                  if self.destination_path else
                                  self.source_path.rsplit("/", 1)[-1])
            self._copy_single_object(gcs_hook, sftp_hook, self.source_path,
                                     destination_object)
Beispiel #15
0
def createTestFile(**kwargs):
    """
    Create and GPG-encrypt a test file on the first configured SFTP site.

    The site, connection name and destination directory come from the first
    entry of ``dag_config['SFTP_Polling_Sites']``; both commands are run
    through the connection's ``execute`` and their decoded output is printed.
    """
    SFTP_Name = dag_config['SFTP_Polling_Sites'][0]['SFTP_Name']
    SFTP_Connection_Name = dag_config['SFTP_Polling_Sites'][0]['SFTP_Connection_Name']
    SFTP_Destination_Path = dag_config['SFTP_Polling_Sites'][0]['Feed_Groups'][0]['Feed_Group_Location']

    randomSuffix = randint(0, 9999999)
    fileName = os.path.join(SFTP_Destination_Path, 'testfile_{}.txt'.format(randomSuffix))
    createFileCommand = "echo 'Hello World!' > {}".format(fileName)
    gpgCommand = "gpg --output {}.gpg -e -r [email protected] {}".format(fileName, fileName)

    sftpHook = SFTPHook(ftp_conn_id = SFTP_Connection_Name)

    for label, value in (
        ('SFTP_Name: {}', SFTP_Name),
        ('SFTP_Connection: {}', SFTP_Connection_Name),
        ('SFTP_Destination_Path: {}', SFTP_Destination_Path),
        ('Random Filename: {}', fileName),
        ('GPG Command: {}', gpgCommand),
    ):
        print(label.format(value))

    conn = sftpHook.get_conn()

    def runAndDecode(command):
        # Each element returned by execute() is decoded from UTF-8 bytes.
        return [chunk.decode('utf-8') for chunk in conn.execute(command)]

    print('Create File Results: {}'.format(runAndDecode(createFileCommand)))
    print('GPG Results: {}'.format(runAndDecode(gpgCommand)))
def pollForFiles(**kwargs):
    """
    Tag files that stopped changing on an SFTP site with a ".ready" suffix.

    Every feed group directory is described once, the function sleeps for
    ``SLEEP_TIME`` and describes the directories again. Files that matched
    the group's regex in the first pass and report an unchanged ``size`` and
    ``modify`` value in the second pass are renamed to ``<name>.ready``.

    Returns 0 when the first pass matches nothing, otherwise None.
    """
    sftpConnName = kwargs['SFTP_Connection_Name']
    feedGroups = kwargs['Feed_Groups']

    # Credentials are expected to be stored under this connection name.
    sourceHook = SFTPHook(ftp_conn_id = sftpConnName)

    # First pass: full remote path -> attributes for every matching file.
    fileMatches = {}
    for group in feedGroups:
        fullPath = group['Feed_Group_Location']
        filePattern = group['Feed_Group_Regex']
        feedGroupName = group['Feed_Group_Name']

        logging.info('Evaluating Feed Group {}'.format(feedGroupName))

        try:
            directory = sourceHook.describe_directory(path = fullPath)
            for name, attributes in directory.items():
                if re.match(filePattern, name):
                    fileMatches[os.path.join(fullPath, name)] = attributes
        except Exception as e:
            logging.error('Error attempting to poll feed group {} in directory {}'.format(feedGroupName, fullPath))
            raise e

    # Nothing matched any mask in any directory: stop here.
    if not fileMatches:
        return 0

    # Without trigger files or renames on the client side, the only signal
    # that an upload finished is an unchanged size/modified time after a wait.
    time.sleep(SLEEP_TIME)

    # Second pass: rename whatever did not change in the meantime.
    for group in feedGroups:
        fullPath = group['Feed_Group_Location']
        feedGroupName = group['Feed_Group_Name']

        logging.info('Evaluating Feed Group {} after sleeping'.format(feedGroupName))

        try:
            newDirResults = sourceHook.describe_directory(fullPath)

            for name in newDirResults:
                fullFilePath = os.path.join(fullPath, name)
                if fullFilePath not in fileMatches:
                    continue

                current = newDirResults[name]
                previous = fileMatches[fullFilePath]
                if current['size'] == previous['size'] and \
                        current['modify'] == previous['modify']:
                    # Ready for another process to pick up and transfer.
                    sourceHook.conn.rename(fullFilePath, fullFilePath + '.ready')
                    logging.info('Tagged the {} file as ready.'.format(fullFilePath))
        except Exception as e:
            logging.error('Error attempting to rename files in feed group {} in directory {}'.format(feedGroupName, fullPath))
            raise e
 def execute(self, context):
     """
     Store the number of chunks needed for the remote file in a Variable.

     The file size is read with ``lstat`` and divided by ``self.chunk_size``
     (rounded up). The result is written under ``self.chunks_variable_name``
     inside the JSON dictionary stored in the ``self.master_variable``
     Variable.

     :param context: Airflow task context (not used here)
     """
     conn = SFTPHook(ftp_conn_id=self.conn_id)
     my_conn = conn.get_conn()
     total_size = my_conn.lstat(self.file_path).st_size
     # Without deserialize_json the Variable comes back as a plain string,
     # which does not support the item assignment below.
     master_variable_dict = Variable.get(self.master_variable,
                                         deserialize_json=True)
     master_variable_dict[self.chunks_variable_name] = math.ceil(total_size / self.chunk_size)
     # serialize_json keeps the stored value valid JSON for the next reader.
     Variable.set(self.master_variable,
                  master_variable_dict,
                  serialize_json=True)
     # NOTE(review): purpose of this pause is not visible here - confirm.
     time.sleep(5)
 def execute(self, context):
     """
     Rename ``self.source_file`` to ``self.dest_file`` on the SFTP server.

     An IOError raised by the rename is logged and otherwise ignored.
     """
     self.log.info("Going to start Rename SFTP Operator")

     hook = SFTPHook(ftp_conn_id=self.sftp_conn_id)
     hook.no_host_key_check = True
     client = hook.get_conn()

     try:
         client.rename(self.source_file, self.dest_file)
     except IOError:
         self.log.info("File not found, skipping")

     self.log.info("Finished executing RenameSFTPOperator")
 def __init__(self,
              filepath,
              filepattern,
              sftp_conn_id='sftp_default',
              *args,
              **kwargs):
     """
     Store the directory and pattern to watch and build the SFTP hook.

     :param filepath: Remote directory to inspect
     :param filepattern: Pattern matched against file names
     :param sftp_conn_id: Connection id handed to ``SFTPHook``
     """
     super(SFTPSensor, self).__init__(*args, **kwargs)
     self.filepath, self.filepattern = filepath, filepattern
     self.hook = SFTPHook(sftp_conn_id)
def archive_files_in_sftp(**context):
    """
    Move already-transferred files into ``archive/<most_recent_date>`` on SFTP.

    The file list and the date are pulled from XCom (task
    ``get_list_of_alma_sftp_files_to_transer``). The ``archive`` directory
    and its dated subdirectory are created when missing.

    :return: number of files that were renamed into the archive
    """
    sftp_conn = SFTPHook(ftp_conn_id=ALMA_SFTP_CONNECTION_ID)
    # Paramiko is the underlying package used for SSH/SFTP conns
    # the paramiko client exposes a lot more core SFTP functionality
    paramiko_conn = sftp_conn.get_conn()

    most_recent_date = context['task_instance'].xcom_pull(
        task_ids='get_list_of_alma_sftp_files_to_transer',
        key='most_recent_date')
    list_of_files = context['task_instance'].xcom_pull(
        task_ids='get_list_of_alma_sftp_files_to_transer')
    archive_path = "archive"

    # The two checks are independent: with the previous if/elif, a missing
    # archive directory meant the dated subdirectory was never created and
    # every rename below failed.
    if archive_path not in sftp_conn.list_directory("./"):
        sftp_conn.create_directory(path=f"./{archive_path}")
    if str(most_recent_date) not in sftp_conn.list_directory(
            f"./{archive_path}"):
        sftp_conn.create_directory(f"./{archive_path}/{most_recent_date}")

    count = 0
    # xcom_pull returns None when the upstream task pushed nothing.
    for filename in list_of_files or []:
        logging.info(
            f"Moving {filename} to {archive_path}/{most_recent_date}/{filename}"
        )

        paramiko_conn.rename(f"{filename}",
                             f"{archive_path}/{most_recent_date}/{filename}")
        count += 1
    return count
    def execute(self, context):
        """
        Download every file in ``self.sftp_path`` and upload them to S3.

        Each file is fetched into its own temporary file, with up to five
        attempts per file. All downloaded files are then uploaded through a
        pool of ``self.workers`` workers.

        :param context: Airflow task context (not used here)
        :return: False when the directory listing is empty, otherwise the
            list of remote paths that were processed
        """
        self.log.info("Going to start Bulk sftp to s3 operator")
        sftp_hook = SFTPHook(ftp_conn_id=self.sftp_conn_id)
        sftp_hook.no_host_key_check = True
        list_dir = sftp_hook.list_directory(self.sftp_path)

        if len(list_dir) < 1:
            self.log.info("Got no files to process. Skipping")
            return False

        self.log.info(f"Got {len(list_dir)} files to move")
        # One {"ftp": temp file, "s3_key": key} entry per downloaded file.
        temp_files = []
        file_path_list = []
        # The downloads use a separate SFTP client opened through SSHHook;
        # sftp_hook above is only used for the directory listing.
        ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id)
        sftp_client = ssh_hook.get_conn().open_sftp()
        s3_hook = S3Hook(self.aws_conn_id)
        for file_name in list_dir:
            file_path = os.path.join(self.sftp_path, file_name)
            file_path_list.append(file_path)
            s3_key = str(os.path.join(self.dest_path, file_name))
            # NOTE(review): these temp files are never closed explicitly;
            # they stay open at least until the upload below has run.
            file_metadata = {"ftp": NamedTemporaryFile("w"), "s3_key": s3_key}
            # Up to five download attempts (i = 0..4) per file.
            for i in range(0, 5):
                try:
                    self.log.info(f"Downloading {file_path}")
                    sftp_client.get(file_path, file_metadata["ftp"].name)
                    file_metadata["ftp"].flush()
                    temp_files.append(file_metadata)
                    break
                except Exception:
                    self.log.info(
                        f"Got no response from server, waiting for next try number {(i + 1)}"
                    )
                    if i < 4:
                        # Exponential backoff with jitter, then reconnect
                        # with a fresh SFTP client before the next attempt.
                        time.sleep(2 ** i + random.random())
                        sftp_client = (
                            SSHHook(ssh_conn_id=self.sftp_conn_id)
                            .get_conn()
                            .open_sftp()
                        )
                    else:
                        # Fifth failure: give up and surface the last error.
                        raise

        self.log.info(f"Uploading to S3 with {self.workers} workers")
        # NOTE(review): the positional tuple presumably maps to
        # load_file(filename, key, bucket_name, replace, encrypt) - confirm
        # against the S3Hook version in use.
        with Pool(self.workers) as pool:
            pool.starmap(
                s3_hook.load_file,
                [
                    (x["ftp"].name, x["s3_key"], self.dest_bucket, True, False)
                    for x in temp_files
                ],
            )

        self.log.info("Finished executing Bulk sftp to s3 operator")
        return file_path_list
Beispiel #22
0
 def poke(self, context):
     """
     Probe ``self.path`` on SFTP using a hook built for this poke.

     :return: True when the modification-time lookup succeeds, False when
         it fails with ``SFTP_NO_SUCH_FILE``; other IOErrors are re-raised.
     """
     self.hook = SFTPHook(self.sftp_conn_id)
     self.log.info('Poking for %s', self.path)
     try:
         self.hook.get_mod_time(self.path)
     except IOError as err:
         if err.errno == SFTP_NO_SUCH_FILE:
             return False
         raise err
     else:
         # The connection is closed only after a successful lookup.
         self.hook.close_conn()
         return True
Beispiel #23
0
 def __init__(self,
              filepath,
              filepattern,
              sftp_conn_id='sftp_default',
              *args,
              **kwargs):
     """
     Store the sensor settings and build an SFTP hook with a 10 keepalive interval.

     :param filepath: Remote directory to inspect
     :param filepattern: Pattern matched against file names
     :param sftp_conn_id: Connection id handed to ``SFTPHook``
     """
     super(SFTPSensor, self).__init__(*args, **kwargs)
     self.filepath, self.filepattern = filepath, filepattern
     self.sftp_conn_id = sftp_conn_id
     self.hook = SFTPHook(ftp_conn_id=sftp_conn_id, keepalive_interval=10)
Beispiel #24
0
def removeFileFromSFTP(**kwargs):
    """
    Delete the file named in the dag_run config from its SFTP site.

    Reads ``SFTP_Connection_Name`` and ``File_Name`` from
    ``kwargs['dag_run'].conf``.
    """
    run_conf = kwargs['dag_run'].conf
    sftpConn = run_conf['SFTP_Connection_Name']
    fileName = run_conf['File_Name']

    sftpHook = SFTPHook(ftp_conn_id=sftpConn)

    logging.info('Attempting to delete {} from {}'.format(fileName, sftpConn))
    sftpHook.delete_file(fileName)
    logging.info('Deletion Successful')
Beispiel #25
0
def pollForFiles(**kwargs):
    """
    Poll the flatfiles directory for files to process.

    Every file in the ``auto`` directory whose name matches the store-feed
    regex is moved to the ``processing`` directory, and a ``ProcessStoreFeed``
    DAG run is triggered for it. The SFTP connection is closed when the
    function exits, including when a rename or trigger fails.
    """
    fileRegex = r'^BP_STORE.*\.txt$'
    autoPath = '/airflow_test/postgresdb_etl_poc_flatfiles/auto'
    processPath = '/airflow_test/postgresdb_etl_poc_flatfiles/processing'

    sourceHook = SFTPHook(ftp_conn_id='kub2VM')

    try:
        fileMatches = {}

        try:
            directory = sourceHook.describe_directory(path=autoPath)
            for file in directory.keys():
                if re.match(fileRegex, file):
                    fileMatches[file] = directory[file]
        except Exception as e:
            logging.error(
                'Error attempting to poll directory {}'.format(autoPath))
            raise e

        print('FileMatches: {}'.format(fileMatches))

        for file in fileMatches.keys():
            sourceHook.conn.rename(os.path.join(autoPath, file),
                                   os.path.join(processPath, file))

            triggerConfig = {
                'fileName': file,
                'processPath': processPath,
            }

            trigger_dag(dag_id='ProcessStoreFeed',
                        run_id='trig_{}'.format(
                            timezone.utcnow().isoformat()),
                        conf=json.dumps(triggerConfig),
                        execution_date=None,
                        replace_microseconds=False)

            logging.info('Triggered DAG Job for File: {}'.format(file))

            # Introduce a delay between scheduling dags so there is an order
            # to execution; sub-second submissions might otherwise all run
            # at once.
            time.sleep(10)
    finally:
        # Close the connection on every exit path, not only on success.
        if sourceHook.conn:
            sourceHook.close_conn()
def check_for_file_py(**kwargs):
    """
    Report whether ``FILENAME`` is listed in the ``FILEPATH`` SFTP directory.

    ``path`` and the templated ``filename`` are read from ``kwargs`` but the
    lookup itself uses the module-level ``FILEPATH`` and ``FILENAME``.

    :return: True when ``FILENAME`` is in the directory listing, else False
    """
    path = kwargs.get('path', None)
    logging.info('path type: {} || path value: {}'.format(type(path), path))

    sftp_conn_id = kwargs.get('sftp_conn_id', None)
    filename = kwargs.get('templates_dict').get('filename', None)

    sftp_hook = SFTPHook(ftp_conn_id=sftp_conn_id)
    logging.info('sftp_hook type: {} || sftp_hook value: {}'.format(
        type(sftp_hook), sftp_hook))

    # get_conn() is called before the directory is listed.
    sftp_client = sftp_hook.get_conn()
    fileList = sftp_hook.list_directory(FILEPATH)
    logging.info('FileList: {}'.format(fileList))

    return True if FILENAME in fileList else False
Beispiel #27
0
def sftp_to_pg(**kwargs):
    """
    Load today's escrow file from SFTP into the ``escrow.escrow`` table.

    The first remote file whose name starts with
    ``tls.cityofdetroit.out.<yymmdd>`` is downloaded to ``/tmp``; the table
    is truncated and then refilled with a ``copy`` statement.
    """
    today = datetime.date.today().strftime('%y%m%d')
    conn = SFTPHook('sftp_cityftp')
    files = conn.describe_directory('/Home/IET/PNC')

    todays_files = []
    for fn in files.keys():
        if fn.startswith(f"tls.cityofdetroit.out.{today}"):
            todays_files.append(fn)
    file_name = todays_files[0]

    conn.retrieve_file(f"/Home/IET/PNC/{file_name}", f"/tmp/{file_name}")

    pg_conn = PostgresHook('etl_postgres')
    pg_conn.run("truncate table escrow.escrow")
    copy_statement = (
        f"copy escrow.escrow from '/tmp/{file_name}' (FORMAT CSV, HEADER FALSE) "
    )
    pg_conn.run(copy_statement)
 def execute(self, context):
     """
     Read one chunk of the remote file and upload it as a multipart part.

     The chunk starts at ``self.chunk_number * self.chunk_size`` and is at
     most ``self.chunk_size`` long. It is sent to S3 with ``upload_part``
     for the multipart upload identified by ``self.upload_id``.

     :param context: Airflow task context (not used here)
     """
     conn_source = SFTPHook(ftp_conn_id=self.conn_id_source)
     my_conn_source = conn_source.get_conn()
     source_file = my_conn_source.sftp_client.file(self.file_source_path,
                                                   'r')
     try:
         source_file.seek(self.chunk_number * self.chunk_size)
         payload = source_file.read(self.chunk_size)
     finally:
         # Release the remote file handle even when the read fails.
         source_file.close()
     client = boto3.client('s3')
     # NOTE(review): chunk_number is also used as a zero-based offset
     # multiplier above, while S3 part numbers presumably start at 1 -
     # confirm how the first chunk is numbered by the caller.
     client.upload_part(Body=payload,
                        Bucket=self.bucket,
                        Key=self.key,
                        PartNumber=self.chunk_number,
                        UploadId=self.upload_id)
Beispiel #29
0
    def execute(self, context):
        """
        Copy one object, or every object matching a single wildcard, from GCS to SFTP.

        Without a wildcard in ``self.source_object`` that single object is
        copied. With one, the text before the wildcard is used as the prefix
        and the text after it as the delimiter for the bucket listing. Each
        destination is ``self.destination_path`` joined with the object name.

        :param context: Airflow task context (not used here)
        :raises AirflowException: when ``source_object`` has more than one wildcard
        """
        gcs_hook = GoogleCloudStorageHook(gcp_conn_id=self.gcp_conn_id,
                                          delegate_to=self.delegate_to)

        sftp_hook = SFTPHook(self.sftp_conn_id)

        if WILDCARD not in self.source_object:
            destination_path = os.path.join(self.destination_path,
                                            self.source_object)
            self._copy_single_object(gcs_hook, sftp_hook, self.source_object,
                                     destination_path)
            self.log.info("Done. Uploaded '%s' file to %s", self.source_object,
                          destination_path)
            return

        total_wildcards = self.source_object.count(WILDCARD)
        if total_wildcards > 1:
            raise AirflowException(
                "Only one wildcard '*' is allowed in source_object parameter. "
                "Found {} in {}.".format(total_wildcards,
                                         self.source_object))

        prefix, delimiter = self.source_object.split(WILDCARD, 1)
        objects = gcs_hook.list(self.source_bucket,
                                prefix=prefix,
                                delimiter=delimiter)

        for source_object in objects:
            self._copy_single_object(
                gcs_hook,
                sftp_hook,
                source_object,
                os.path.join(self.destination_path, source_object),
            )

        self.log.info("Done. Uploaded '%d' files to %s", len(objects),
                      self.destination_path)
class SFTPSensor(BaseSensorOperator):
    """
    Sensor that lists a remote directory through ``SFTPHook`` and succeeds
    when at least one entry name matches a regular expression.

    :param filepath: remote directory handed to ``SFTPHook.list_directory``
    :param filepattern: regular expression applied with ``match`` (anchored at
        the start of the name) to every listed entry
    :param sftp_conn_id: connection id handed to ``SFTPHook``
    """

    @apply_defaults
    def __init__(self, filepath, filepattern, sftp_conn_id='sftp_default', *args, **kwargs):
        super(SFTPSensor, self).__init__(*args, **kwargs)
        self.filepath = filepath
        self.filepattern = filepattern
        self.hook = SFTPHook(sftp_conn_id)

    def poke(self, context):
        """
        List ``filepath`` and push the matching entry names to XCom.

        The list of matches (possibly empty) is pushed under the XCom key
        ``file_name`` whenever the directory could be listed.

        :param context: task context; ``context["task_instance"]`` is used to
            push the XCom value
        :return: ``True`` if at least one entry matched, ``False`` if none
            matched or the listing failed with ``SFTP_NO_SUCH_FILE``
        :raises IOError: any listing error other than ``SFTP_NO_SUCH_FILE``
        """
        pattern = re.compile(self.filepattern)

        # Only the remote listing is guarded, so that an unrelated IOError
        # raised later (e.g. while pushing to XCom) is not silently turned
        # into "file not found yet".
        try:
            entries = self.hook.list_directory(self.filepath)
        except IOError as e:
            if e.errno != SFTP_NO_SUCH_FILE:
                raise
            return False

        matches = []
        for entry in entries:
            if pattern.match(entry):
                matches.append(entry)
                # Report through the task logger rather than print().
                self.log.info('I found the file! %s', entry)
            else:
                self.log.info(entry)
                self.log.info(pattern)

        context["task_instance"].xcom_push("file_name", matches)

        return bool(matches)
Beispiel #31
0
    def execute(self, context):
        """
        Build the hook for the configured connection type, list the remote
        path with the search/date filters, and download whatever was found.

        Missing ``min_date`` / ``max_date`` fall back to ``'1900-01-01'`` and
        ``'2999-12-31'`` respectively.

        :param context: task context (not read by this method)
        :return: ``self.downloaded_files``
        """
        lower_bound = FTPSearchOperator._get_date_param(self.min_date or '1900-01-01')
        upper_bound = FTPSearchOperator._get_date_param(self.max_date or '2999-12-31')
        self.log.info("Using ftp connection: %s", self.ftp_conn_id)
        self.log.info("min date: %s ", lower_bound.to_datetime_string())
        self.log.info("max date: %s ", upper_bound.to_datetime_string())

        # Choose the hook class from the connection type; anything other than
        # 'sftp' or 'ftps' uses the plain FTP hook.
        conn_type = self.ftp_conn_type
        if conn_type == 'sftp':
            hook_cls = SFTPHook
        elif conn_type == 'ftps':
            hook_cls = FTPSHook
        else:
            hook_cls = FTPHook
        self.ftp_hook = hook_cls(ftp_conn_id=self.ftp_conn_id)

        connection = self.ftp_hook.get_connection(self.ftp_conn_id)
        self.log.info("ftp connection info: %s ", connection.port)

        self.log.info("Getting directory listing for %s", self.remote_filepath)
        matching_files = self.get_file_list(
            ftp_hook=self.ftp_hook,
            remote_filepath=self.remote_filepath,
            search_expr=self.search_expr,
            min_date=lower_bound,
            max_date=upper_bound)

        if matching_files is None:
            self.log.info("No files found matching filters in %s", self.remote_filepath)
        else:
            self.download_files(self.ftp_hook, matching_files)

        return self.downloaded_files
Beispiel #32
0
    def execute(self, context):
        """
        Copy the files listed in ``self.ftp_folder`` on the SFTP server to S3,
        staging each one in ``self.tmp_directory`` on the local disk.

        If ``self.filter`` is set it is called with each file name and only
        names for which it returns a truthy value are transferred. A file
        whose key already exists in the bucket is skipped unless
        ``self.replace`` is set.

        :param context: task context (not read by this method)
        """
        sftp_hook = SFTPHook(ftp_conn_id=self.sftp_conn_id)
        s3_hook = S3Hook(s3_conn_id=self.s3_conn_id)
        sftp_hook.get_conn()
        file_list = sftp_hook.list_directory(self.ftp_folder)
        if self.filter:
            # The filtered result must be kept: filter() does not modify the
            # list in place, so discarding its return value is a no-op.
            file_list = [name for name in file_list if self.filter(name)]

        # create tmp directory
        if not os.path.exists(self.tmp_directory):
            os.makedirs(self.tmp_directory)

        for file_name in file_list:
            s3_key_file = self.s3_key + "/" + str(file_name)

            # The existence check only matters when existing keys must be
            # preserved, so skip the S3 round trip when replacing anyway.
            if not self.replace and s3_hook.check_for_key(s3_key_file,
                                                          self.s3_bucket):
                continue

            ftp_file_fullpath = self.ftp_folder + "/" + str(file_name)
            local_file_fullpath = self.tmp_directory + "/" + str(file_name)

            logging.info("Dowloading file [%s] from sftp to local [%s]",
                         ftp_file_fullpath, local_file_fullpath)
            sftp_hook.get_file(ftp_file_fullpath, local_file_fullpath)
            logging.info("Done.")
            logging.info("Uploading file [%s] to S3 on bucket [%s] and key [%s]",
                         local_file_fullpath, self.s3_bucket, s3_key_file)
            s3_hook.load_file(local_file_fullpath, s3_key_file,
                              self.s3_bucket, self.replace)
            logging.info("Done.")
class SFTPHookTest(unittest.TestCase):
    """
    Tests for ``SFTPHook`` that operate on paths below ``TMP_PATH``.

    NOTE(review): the hook is built with its default connection, so these
    presumably need a reachable SFTP server that exposes the local
    ``TMP_PATH`` -- confirm against the test environment.
    """

    def setUp(self):
        """Load the test config and create the scratch directory and file."""
        configuration.load_test_config()
        self.hook = SFTPHook()
        os.makedirs(os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS))
        with open(os.path.join(TMP_PATH, TMP_FILE_FOR_TESTS), 'a') as f:
            f.write('Test file')

    def test_get_conn(self):
        """``get_conn`` returns a ``pysftp.Connection``."""
        output = self.hook.get_conn()
        self.assertIsInstance(output, pysftp.Connection)

    def test_close_conn(self):
        """``close_conn`` resets ``hook.conn`` to ``None``."""
        self.hook.conn = self.hook.get_conn()
        self.assertIsNotNone(self.hook.conn)
        self.hook.close_conn()
        self.assertIsNone(self.hook.conn)

    def test_describe_directory(self):
        """The scratch directory shows up when describing ``TMP_PATH``."""
        output = self.hook.describe_directory(TMP_PATH)
        self.assertIn(TMP_DIR_FOR_TESTS, output)

    def test_list_directory(self):
        """Listing the freshly created scratch directory yields nothing."""
        output = self.hook.list_directory(
            path=os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS))
        self.assertEqual(output, [])

    def test_create_and_delete_directory(self):
        """A directory created through the hook can be removed again."""
        new_dir_name = 'new_dir'
        self.hook.create_directory(os.path.join(
            TMP_PATH, TMP_DIR_FOR_TESTS, new_dir_name))
        output = self.hook.describe_directory(
            os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS))
        self.assertIn(new_dir_name, output)
        self.hook.delete_directory(os.path.join(
            TMP_PATH, TMP_DIR_FOR_TESTS, new_dir_name))
        output = self.hook.describe_directory(
            os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS))
        self.assertNotIn(new_dir_name, output)

    def test_store_retrieve_and_delete_file(self):
        """Round-trip a file: store it, fetch it back, then delete it."""
        self.hook.store_file(
            remote_full_path=os.path.join(
                TMP_PATH, TMP_DIR_FOR_TESTS, TMP_FILE_FOR_TESTS),
            local_full_path=os.path.join(TMP_PATH, TMP_FILE_FOR_TESTS)
        )
        output = self.hook.list_directory(
            path=os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS))
        self.assertEqual(output, [TMP_FILE_FOR_TESTS])
        retrieved_file_name = 'retrieved.txt'
        self.hook.retrieve_file(
            remote_full_path=os.path.join(
                TMP_PATH, TMP_DIR_FOR_TESTS, TMP_FILE_FOR_TESTS),
            local_full_path=os.path.join(TMP_PATH, retrieved_file_name)
        )
        self.assertIn(retrieved_file_name, os.listdir(TMP_PATH))
        # Remove the downloaded copy so it does not leak into other tests.
        os.remove(os.path.join(TMP_PATH, retrieved_file_name))
        self.hook.delete_file(path=os.path.join(
            TMP_PATH, TMP_DIR_FOR_TESTS, TMP_FILE_FOR_TESTS))
        output = self.hook.list_directory(
            path=os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS))
        self.assertEqual(output, [])

    def test_get_mod_time(self):
        """``get_mod_time`` returns a value of length 14 for a stored file."""
        self.hook.store_file(
            remote_full_path=os.path.join(
                TMP_PATH, TMP_DIR_FOR_TESTS, TMP_FILE_FOR_TESTS),
            local_full_path=os.path.join(TMP_PATH, TMP_FILE_FOR_TESTS)
        )
        output = self.hook.get_mod_time(path=os.path.join(
            TMP_PATH, TMP_DIR_FOR_TESTS, TMP_FILE_FOR_TESTS))
        # presumably a YYYYMMDDHHMMSS timestamp string -- TODO confirm
        self.assertEqual(len(output), 14)

    def tearDown(self):
        """Remove the scratch directory and file created by ``setUp``."""
        shutil.rmtree(os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS))
        os.remove(os.path.join(TMP_PATH, TMP_FILE_FOR_TESTS))
 def setUp(self):
     """Load the test config, build the hook, and create the scratch directory and file."""
     configuration.load_test_config()
     self.hook = SFTPHook()
     scratch_dir = os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS)
     os.makedirs(scratch_dir)
     fixture_path = os.path.join(TMP_PATH, TMP_FILE_FOR_TESTS)
     with open(fixture_path, 'a') as fixture:
         fixture.write('Test file')
Beispiel #35
0
 def __init__(self, path, sftp_conn_id='sftp_default', *args, **kwargs):
     """
     Store the remote path and build the SFTP hook.

     :param path: value stored on ``self.path``
     :param sftp_conn_id: connection id handed to ``SFTPHook``
     """
     super().__init__(*args, **kwargs)
     self.path = path
     self.hook = SFTPHook(sftp_conn_id)
class SFTPHookTest(unittest.TestCase):
    """
    Tests for ``SFTPHook``: file and directory operations below ``TMP_PATH``
    and parsing of the host-key-check flags from the connection extras.

    NOTE(review): the file/directory tests build the hook with its default
    connection, so they presumably need a reachable SFTP server that exposes
    the local ``TMP_PATH`` -- confirm against the test environment.
    """

    def setUp(self):
        """Load the test config and create the scratch directory and file."""
        configuration.load_test_config()
        self.hook = SFTPHook()
        os.makedirs(os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS))
        with open(os.path.join(TMP_PATH, TMP_FILE_FOR_TESTS), 'a') as f:
            f.write('Test file')

    def test_get_conn(self):
        """``get_conn`` returns a ``pysftp.Connection``."""
        output = self.hook.get_conn()
        self.assertIsInstance(output, pysftp.Connection)

    def test_close_conn(self):
        """``close_conn`` resets ``hook.conn`` to ``None``."""
        self.hook.conn = self.hook.get_conn()
        self.assertIsNotNone(self.hook.conn)
        self.hook.close_conn()
        self.assertIsNone(self.hook.conn)

    def test_describe_directory(self):
        """The scratch directory shows up when describing ``TMP_PATH``."""
        output = self.hook.describe_directory(TMP_PATH)
        self.assertIn(TMP_DIR_FOR_TESTS, output)

    def test_list_directory(self):
        """Listing the freshly created scratch directory yields nothing."""
        output = self.hook.list_directory(
            path=os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS))
        self.assertEqual(output, [])

    def test_create_and_delete_directory(self):
        """A directory created through the hook can be removed again."""
        new_dir_name = 'new_dir'
        self.hook.create_directory(os.path.join(
            TMP_PATH, TMP_DIR_FOR_TESTS, new_dir_name))
        output = self.hook.describe_directory(
            os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS))
        self.assertIn(new_dir_name, output)
        self.hook.delete_directory(os.path.join(
            TMP_PATH, TMP_DIR_FOR_TESTS, new_dir_name))
        output = self.hook.describe_directory(
            os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS))
        self.assertNotIn(new_dir_name, output)

    def test_store_retrieve_and_delete_file(self):
        """Round-trip a file: store it, fetch it back, then delete it."""
        self.hook.store_file(
            remote_full_path=os.path.join(
                TMP_PATH, TMP_DIR_FOR_TESTS, TMP_FILE_FOR_TESTS),
            local_full_path=os.path.join(TMP_PATH, TMP_FILE_FOR_TESTS)
        )
        output = self.hook.list_directory(
            path=os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS))
        self.assertEqual(output, [TMP_FILE_FOR_TESTS])
        retrieved_file_name = 'retrieved.txt'
        self.hook.retrieve_file(
            remote_full_path=os.path.join(
                TMP_PATH, TMP_DIR_FOR_TESTS, TMP_FILE_FOR_TESTS),
            local_full_path=os.path.join(TMP_PATH, retrieved_file_name)
        )
        self.assertIn(retrieved_file_name, os.listdir(TMP_PATH))
        # Remove the downloaded copy so it does not leak into other tests.
        os.remove(os.path.join(TMP_PATH, retrieved_file_name))
        self.hook.delete_file(path=os.path.join(
            TMP_PATH, TMP_DIR_FOR_TESTS, TMP_FILE_FOR_TESTS))
        output = self.hook.list_directory(
            path=os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS))
        self.assertEqual(output, [])

    def test_get_mod_time(self):
        """``get_mod_time`` returns a value of length 14 for a stored file."""
        self.hook.store_file(
            remote_full_path=os.path.join(
                TMP_PATH, TMP_DIR_FOR_TESTS, TMP_FILE_FOR_TESTS),
            local_full_path=os.path.join(TMP_PATH, TMP_FILE_FOR_TESTS)
        )
        output = self.hook.get_mod_time(path=os.path.join(
            TMP_PATH, TMP_DIR_FOR_TESTS, TMP_FILE_FOR_TESTS))
        # presumably a YYYYMMDDHHMMSS timestamp string -- TODO confirm
        self.assertEqual(len(output), 14)

    @mock.patch('airflow.contrib.hooks.sftp_hook.SFTPHook.get_connection')
    def test_no_host_key_check_default(self, get_connection):
        """Without extras, ``no_host_key_check`` is ``False``."""
        connection = Connection(login='******', host='host')
        get_connection.return_value = connection
        hook = SFTPHook()
        self.assertEqual(hook.no_host_key_check, False)

    @mock.patch('airflow.contrib.hooks.sftp_hook.SFTPHook.get_connection')
    def test_no_host_key_check_enabled(self, get_connection):
        """``"no_host_key_check": true`` in the extras enables the flag."""
        connection = Connection(
            login='******', host='host',
            extra='{"no_host_key_check": true}')

        get_connection.return_value = connection
        hook = SFTPHook()
        self.assertEqual(hook.no_host_key_check, True)

    @mock.patch('airflow.contrib.hooks.sftp_hook.SFTPHook.get_connection')
    def test_no_host_key_check_disabled(self, get_connection):
        """``"no_host_key_check": false`` in the extras leaves the flag off."""
        connection = Connection(
            login='******', host='host',
            extra='{"no_host_key_check": false}')

        get_connection.return_value = connection
        hook = SFTPHook()
        self.assertEqual(hook.no_host_key_check, False)

    @mock.patch('airflow.contrib.hooks.sftp_hook.SFTPHook.get_connection')
    def test_no_host_key_check_disabled_for_all_but_true(self, get_connection):
        """A non-boolean value such as ``"foo"`` leaves the flag off."""
        connection = Connection(
            login='******', host='host',
            extra='{"no_host_key_check": "foo"}')

        get_connection.return_value = connection
        hook = SFTPHook()
        self.assertEqual(hook.no_host_key_check, False)

    @mock.patch('airflow.contrib.hooks.sftp_hook.SFTPHook.get_connection')
    def test_no_host_key_check_ignore(self, get_connection):
        """``"ignore_hostkey_verification": true`` also enables the flag."""
        connection = Connection(
            login='******', host='host',
            extra='{"ignore_hostkey_verification": true}')

        get_connection.return_value = connection
        hook = SFTPHook()
        self.assertEqual(hook.no_host_key_check, True)

    @mock.patch('airflow.contrib.hooks.sftp_hook.SFTPHook.get_connection')
    def test_no_host_key_check_no_ignore(self, get_connection):
        """``"ignore_hostkey_verification": false`` leaves the flag off."""
        connection = Connection(
            login='******', host='host',
            extra='{"ignore_hostkey_verification": false}')

        get_connection.return_value = connection
        hook = SFTPHook()
        self.assertEqual(hook.no_host_key_check, False)

    def tearDown(self):
        """Remove the scratch directory and file created by ``setUp``."""
        shutil.rmtree(os.path.join(TMP_PATH, TMP_DIR_FOR_TESTS))
        os.remove(os.path.join(TMP_PATH, TMP_FILE_FOR_TESTS))