Example #1
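# Standard-library imports used by the class below. The project-specific names
# (Serializer, Downloader, Copernicus, JobStatus, DownloadStatus, upload_worker
# and the job/granule models) are assumed to be imported elsewhere in the
# module. The Queue must be a multiprocessing queue because it is shared with
# the upload worker processes; the Lock is assumed to be a plain threading
# lock, since it is only taken by the download callbacks inside this process.
from datetime import datetime, timedelta
from multiprocessing import Process, Queue
from threading import Lock
from time import sleep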
class Workflow:
    def __init__(self,
                 db_connection,
                 logger,
                 date,
                 max_downloads=None,
                 max_upload_workers=20,
                 allow_repeat=False):
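        """
        Set up a download/upload workflow for a single date.

        Args:
            db_connection: open database connection used by the serializers.
            logger: project logger; must support set_job_id().
            date: the date whose granules should be processed.
            max_downloads: optional cap on the number of downloads to trigger.
            max_upload_workers: number of S3 upload processes to spawn.
            allow_repeat: if True, re-download granules that are already in
                the database even when their previous download did not fail.
        """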
        self.max_downloads = max_downloads
        self.max_upload_workers = max_upload_workers
        self.total_downloads = 0
        self.upload_queue = Queue()
        self.lock = Lock()
        self.allow_repeat = allow_repeat

        # Set up the database connection and the serializers.
        self.db_connection = db_connection
        self.job_serializer = Serializer(self.db_connection, job)
        self.granule_serializer = Serializer(self.db_connection, granule)
        self.logger = logger
        self.date = date

        self.logger.info('Creating a workflow')

        # Downloader that handles asynchronous downloads.
        self.logger.info('Creating aria2 downloader client')
        self.downloader = Downloader(
            on_download_error=self._on_download_error,
            on_download_complete=self._on_download_complete,
            callback_args=(self.lock, ))

        # Copernicus Search API
        self.logger.info('Creating copernicus API connector')
        end_date = (self.date + timedelta(days=1))
        start_date = self.date  # self._get_start_date(self.date, end_date)

        self.copernicus = Copernicus(
            start_date=start_date.isoformat(),
            end_date=end_date.isoformat(),
            rows_per_query=30,
        )

    def start(self, parent_state):
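        """
        Run the workflow: register a job in the database, spawn the S3 upload
        workers, walk the Copernicus feed for the configured date, trigger a
        download for each new (or previously failed) granule, then wait for
        the uploads to finish and mark the job as successful.
        """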
        # Start a new job in the database.
        self.job_id = self.job_serializer.post({
            'job_name': 'Sentinel-2 Downloader',
            'start_time': datetime.now(),
            'date_handled': self.date,
            'status': JobStatus.STARTED,
        })

        self.logger.set_job_id(self.job_id)
        self.logger.info('Starting workflow')

        # Upload workers
        self.logger.info('Creating S3 upload processes')
        self.upload_processes = [
            Process(target=upload_worker,
                    args=(self.upload_queue, self.job_id, i))
            for i in range(self.max_upload_workers)
        ]

        parent_state.job_id = self.job_id
        self.at_least_one_failed_download = False

        # Start processes responsible for uploading downloaded files to S3
        # in a concurrent fashion.
        for upload_process in self.upload_processes:
            upload_process.start()
        self.logger.info('S3 upload processes started')

        # Read each granule from the feed and start its download.
        count = 0
        self.logger.info('Fetching granules info from Copernicus',
                         f'Reading for date: {self.date.isoformat()}')

        for product in self.copernicus.read_feed():
            # Check if granule is already in the database.
            existing = self.granule_serializer.get(product.id)

            if existing is None:
                # If not, add it to the database.
                self.granule_serializer.post({
                    'uuid': product.id,
                    'title': product.title,
                    'copernicus_ingestion_date': product.ingestion_date,
                    'validated': False,
                    'downloader_job_id': self.job_id,
                    'download_status': DownloadStatus.NOT_STARTED,
                })
            elif not self.allow_repeat and \
                    existing['download_status'] not in ['ERROR', 'INVALID']:
                # If it was not a failed download, just skip.
                continue

            # And start the download.
            url = product.get_download_link()
            self.logger.info(f'Starting granule download {product.id}',
                             f'URL: {url}')

            self.downloader.start_download(product)
            self.granule_serializer.put(
                product.id, {'download_status': DownloadStatus.DOWNLOADING})

            count += 1
            if self.max_downloads is not None and count >= self.max_downloads:
                break

            # Throttle: allow at most 30 downloads to be in flight at once.
            while count - self.total_downloads >= 30:
                sleep(5)

        # Set max_downloads to the actual number of downloads triggered.
        # This is needed so that we can send a DONE message to the S3 uploader
        # when all downloads complete.
        self.lock.acquire()
        self.max_downloads = count
        # Of course, if all downloads have already been completed at this
        # point, we need to handle that case as well.
        if self.total_downloads == self.max_downloads:
            self.upload_queue.put('DONE')
        self.lock.release()

        # Join the separately running upload processes.
        for upload_process in self.upload_processes:
            upload_process.join()

        self.logger.info('All granules downloaded',
                         f'Total granules processed: {self.total_downloads}')
        parent_state.completed = True

        # At this point, we assume that all downloads and uploads have been
        # completed for this job.

        # Stop listening for any download notifications.
        self.downloader.stop_listening()

        # Set the job status to success.
        self.job_serializer.put(self.job_id, {
            'end_time': datetime.now(),
            'status': JobStatus.SUCCESS
        })

        self.logger.info('Stopping workflow')
        self.logger.set_job_id(None)

    def _on_download_error(self, downloader, gid, lock):
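        """
        aria2 error callback: log the failure, mark the granule as ERROR in
        the database, and count the download so the upload workers can be
        told to quit once every download has finished.
        """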
        product = downloader.get_download_product(gid)

        error_message, error_code = downloader.get_download_error(gid)
        self.logger.error(f'Download error {product.id}',
                          f'Error Code: {error_code}\n{error_message}')
        self.at_least_one_failed_download = True

        # Download status = ERROR
        self.granule_serializer.put(product.id,
                                    {'download_status': DownloadStatus.ERROR})

        # If these are all the downloads, trigger the upload processes to quit.
        lock.acquire()
        self.total_downloads += 1
        if self.total_downloads == self.max_downloads:
            self.upload_queue.put('DONE')
        lock.release()

    def _on_download_complete(self, downloader, gid, lock):
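        """
        aria2 completion callback: queue the downloaded file for S3 upload
        and count the download so the upload workers can be told to quit
        once every download has finished.
        """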
        product = downloader.get_download_product(gid)
        self.logger.info(f'Download complete {product.id}')

        # After each file downloads, we want to upload it to the S3 bucket.
        filename = downloader.get_download_filename(gid)
        self.upload_queue.put((product.id, filename))

        # If these are all the downloads, trigger the upload processes to quit.
        lock.acquire()
        self.total_downloads += 1
        if self.total_downloads == self.max_downloads:
            self.upload_queue.put('DONE')
        lock.release()
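

For illustration, here is a minimal sketch of an upload worker compatible with the queue protocol used above. This is an assumption, not the project's actual upload_worker: the boto3 client and bucket name are hypothetical stand-ins. Because the workflow enqueues a single 'DONE' sentinel for several worker processes, this sketch re-queues the sentinel before exiting so the remaining workers can shut down as well.

import boto3


def example_upload_worker(upload_queue, job_id, worker_index):
    """Hypothetical worker: uploads queued files to S3 until 'DONE' arrives."""
    s3 = boto3.client('s3')
    while True:
        item = upload_queue.get()
        if item == 'DONE':
            # Pass the sentinel on so the other workers also terminate.
            upload_queue.put('DONE')
            break
        granule_id, filename = item
        # Bucket and key layout are placeholders for this sketch.
        s3.upload_file(filename, 'example-bucket', f'{job_id}/{granule_id}')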