def __get_paths_since_last_run(self, filesystem_client: FileSystemClient,
                               max_files: int) -> List[PathProperties]:
        """Collect up to ``max_files`` unprocessed file paths for every hourly
        folder since the last successful run, and tag each returned file with
        a 'processing' timestamp before handing it back.
        """
        state = self.__retrieve_transformation_state(filesystem_client)

        last_successful_run = self.__get_datetime_from_string(
            state['last_successful_run'])
        if 'last_backfill_start' in state:
            self.last_backfill_start = self.__get_datetime_from_string(
                state['last_backfill_start'])

        # One timeslot per hourly ingress folder between the last run and now.
        hourly_slots = pd.date_range(last_successful_run,
                                     datetime.utcnow(), freq='H')

        collected: List[PathProperties] = []
        for slot in hourly_slots:
            folder = (f'{self.guid}/year={slot.year}'
                      f'/month={slot.month:02d}'
                      f'/day={slot.day:02d}'
                      f'/hour={slot.hour:02d}')
            collected += self.__get_file_paths_from_folder(folder,
                                                           filesystem_client)
            # Stop scanning folders once we have enough candidates; the
            # slice below enforces the exact cap.
            if len(collected) > max_files:
                break

        limited = collected[:max_files]

        # Stamp each file as being processed so its state is visible while
        # this run works on it.
        for entry in limited:
            stamp = {'processing': datetime.utcnow().strftime(self.DATE_FORMAT)}
            filesystem_client.get_file_client(entry.name).set_metadata(stamp)

        return limited
    def __get_file_paths_from_folder(
            self, folder_path: str,
            file_system_client: FileSystemClient) -> List[PathProperties]:
        """Return the files under ``folder_path`` that still need processing.

        0-byte files are skipped, and files whose 'processed' metadata
        timestamp is newer than the current backfill start are considered
        done. Any ``ResourceNotFoundError`` (e.g. the hourly folder does
        not exist) yields an empty list.
        """
        pending: List[PathProperties] = []
        try:
            for entry in file_system_client.get_paths(path=folder_path):
                # 0-byte files can be ignored: the ingress-api returns an
                # error to the adapter for those, and the adapter is
                # responsible for re-ingesting the data.
                if entry.content_length == 0:
                    logger.warning(f'0-byte file skipped: {entry.name}')
                    continue

                metadata = file_system_client.get_file_client(
                    entry.name).get_file_properties().metadata

                # Never processed before -> needs processing.
                if 'processed' not in metadata:
                    pending.append(entry)
                    continue

                # Already processed, but a backfill that started after the
                # file was processed means it must be processed again.
                # NOTE(review): assumes self.last_backfill_start is set
                # elsewhere (e.g. __init__) when absent from state — confirm.
                processed_at = self.__get_datetime_from_string(
                    metadata['processed'])
                if self.last_backfill_start > processed_at:
                    pending.append(entry)

            return pending
        except ResourceNotFoundError:
            return []
 def __retrieve_transformation_state(
         self, filesystem_client: FileSystemClient) -> Dict:
     """Load this dataset's transformation state from its JSON state file.

     Returns a default state (last successful run at the start of 2018)
     when no state file exists yet, so a first run scans everything.
     """
     state_path = f'{self.guid}/{self.STATE_FILE}'
     with filesystem_client.get_file_client(state_path) as file_client:
         try:
             raw_state = file_client.download_file().readall()
         except ResourceNotFoundError:
             # First run: no state file has been written yet.
             return {'last_successful_run': '2018-01-01T00:00:00Z'}
         return json.loads(raw_state)
 def __save_transformation_state(self, filesystem_client: FileSystemClient,
                                 state: Dict):
     """Persist ``state`` as JSON, replacing any existing state file."""
     state_path = f'{self.guid}/{self.STATE_FILE}'
     with filesystem_client.get_file_client(state_path) as file_client:
         file_client.upload_data(json.dumps(state), overwrite=True)