Exemple #1
0
class ProductDownloader:
    """Download products, verify them and copy them to an S3 bucket.

    For every entry of a product list the downloader fetches the product
    as a zip file into a dated temporary directory, checks the archive,
    uploads it to S3 and registers the product in the catalog.

    When ``debug`` is true, the download, verification, temp file removal
    and catalog insert are skipped and only logged.
    """

    # Base URL the product download URLs are built from.
    DOWNLOAD_URL_BASE = 'https://scihub.copernicus.eu/apihub/odata/v1'
    # Root directory below which one sub-directory per run date is created.
    TEMP_FILE_ROOT = '/mnt/state/luigi/esadownloader'

    def __init__(self, debug):
        """Load the configuration from ``app.cfg`` and set up logging.

        :param debug: when true, run without downloading or uploading.
        """
        self.config = ConfigManager("app.cfg")
        self.debug = debug
        self.logger = logging.getLogger('luigi-interface')

    def __createTempPath(self, runDate):
        """Return the temp directory for ``runDate``, creating it if needed.

        :param runDate: object with ``strftime``; formatted as YYYY-MM-DD.
        :returns: path of the (existing) directory.
        :raises OSError: if the directory cannot be created.
        """
        tempPath = os.path.join(self.TEMP_FILE_ROOT,
                                runDate.strftime("%Y-%m-%d"))

        if not os.path.isdir(tempPath):
            try:
                os.makedirs(tempPath)
            except OSError:
                # Another process may have created the directory between
                # the check and the call; only fail if it is still missing.
                if not os.path.isdir(tempPath):
                    raise

        return tempPath

    def __download_product(self, product, tempPath):
        """Download one product as ``<title>.zip`` into ``tempPath``.

        :param product: mapping with the keys ``uniqueId`` and ``title``.
        :param tempPath: directory the zip file is written to.
        :returns: path of the zip file (in debug mode the file is not
            actually created).
        :raises Exception: if pycurl reports a transfer error.
        """
        uniqueId = product["uniqueId"]
        name = product["title"]

        url = "%s/Products('%s')/$value" % (self.DOWNLOAD_URL_BASE, uniqueId)

        zipname = "%s.zip" % name
        tempFilename = os.path.join(tempPath, zipname)

        if self.debug:
            self.logger.debug("download url: %s, would create %s", url,
                              tempFilename)
            return tempFilename

        try:
            with open(tempFilename, 'wb') as f:
                c = pycurl.Curl()
                try:
                    c.setopt(c.URL, url)
                    c.setopt(c.FOLLOWLOCATION, True)
                    # NOTE(review): peer certificate verification is
                    # disabled here, which allows man-in-the-middle
                    # attacks on the credentials sent below. Kept as is
                    # to avoid breaking the existing deployment.
                    c.setopt(c.SSL_VERIFYPEER, False)
                    c.setopt(c.USERPWD, self.config.get_esa_credentials())
                    c.setopt(c.WRITEFUNCTION, f.write)
                    c.perform()
                finally:
                    # Release the curl handle even when the transfer fails.
                    c.close()

        except pycurl.error as e:
            msg = "Download of product %s resulted in download error %s" % (
                name, e.args[0])
            raise Exception(msg)

        return tempFilename

    def __verify_zip_file(self, productZipFile):
        """Return True if ``productZipFile`` is a zip file without bad members.

        :param productZipFile: path of the file to check.
        :returns: False if the file is not a zip archive or if
            ``testzip()`` reports a corrupt member, True otherwise.
        """
        if not zipfile.is_zipfile(productZipFile):
            return False

        with zipfile.ZipFile(productZipFile, 'r') as archive:
            if archive.testzip() is not None:
                return False

        return True

    def __copy_product_to_s3(self, sourcepath, filename, awsAccessKeyId,
                             awsSecretKey):
        """Upload ``sourcepath`` to the configured S3 bucket.

        An existing key with the same destination path is deleted first.
        Files larger than ``MAX_SIZE`` bytes are uploaded in parts of
        ``PART_SIZE`` bytes. In debug mode the connection is opened and
        the bucket looked up, but nothing is deleted or uploaded.

        :param sourcepath: local path of the file to upload.
        :param filename: key name appended to the configured dest path.
        :param awsAccessKeyId: AWS access key id.
        :param awsSecretKey: AWS secret access key.
        :returns: the destination key path inside the bucket.
        """
        # max size in bytes before uploading in parts. between 1 and 5 GB
        # recommended
        MAX_SIZE = 5000000000
        # size of parts when uploading in parts
        PART_SIZE = 100000000

        conn = boto.s3.connect_to_region('eu-west-1',
                                         aws_access_key_id=awsAccessKeyId,
                                         aws_secret_access_key=awsSecretKey,
                                         is_secure=True)

        bucket_name = self.config.getAmazonBucketName()
        amazonDestPath = self.config.getAmazonDestPath()
        bucket = conn.get_bucket(bucket_name)

        destpath = os.path.join(amazonDestPath, filename)

        if self.debug:
            self.logger.debug("S3 copy would copy %s to %s", sourcepath,
                              amazonDestPath)
            return destpath

        if bucket.get_key(destpath) is not None:
            bucket.delete_key(destpath)

        filesize = os.path.getsize(sourcepath)
        if filesize > MAX_SIZE:
            mp = bucket.initiate_multipart_upload(destpath)
            try:
                with open(sourcepath, 'rb') as fp:
                    part_num = 0
                    # Each call reads up to PART_SIZE bytes and advances
                    # the file position, so loop until the end is reached.
                    while fp.tell() < filesize:
                        part_num += 1
                        mp.upload_part_from_file(fp,
                                                 part_num,
                                                 num_cb=10,
                                                 size=PART_SIZE)
                mp.complete_upload()
            except Exception:
                # Do not leave an incomplete multipart upload behind.
                mp.cancel_upload()
                raise
        else:
            k = boto.s3.key.Key(bucket)
            k.key = destpath
            k.set_contents_from_filename(sourcepath, num_cb=10)

        return destpath

    def download_products(self, productListFile, runDate, awsAccessKeyId,
                          awsSecretKey, dbConnectionString):
        """Download, verify, upload and catalog every listed product.

        A product that fails to download, verify or upload is logged as a
        warning and skipped; processing continues with the next product.

        :param productListFile: open file containing JSON with a
            ``products`` list.
        :param runDate: date used to name the temporary directory.
        :param awsAccessKeyId: AWS access key id for the S3 upload.
        :param awsSecretKey: AWS secret access key for the S3 upload.
        :param dbConnectionString: passed to ``CatalogManager``.
        :returns: number of products processed successfully.
        """
        productList = json.load(productListFile)

        downloadedProductCount = 0

        tempPath = self.__createTempPath(runDate)

        with CatalogManager(dbConnectionString) as cat:
            for product in productList["products"]:
                # download product
                productZipFile = None
                try:
                    productZipFile = self.__download_product(product, tempPath)
                    self.logger.info("Downloaded product %s", product["title"])
                except Exception as e:
                    self.logger.warning(
                        "Failed to download product %s with error %s ",
                        product["title"], e)
                    continue

                if productZipFile is None and not self.debug:
                    continue

                # verify product
                if not self.debug:
                    if not self.__verify_zip_file(productZipFile):
                        self.logger.warning(
                            "Failed to download product %s with error invalid zip file",
                            product["title"])
                        continue
                    # Only report success once the archive really passed.
                    self.logger.info("Verified product %s", product["title"])

                # transfer to s3
                try:
                    product["location"] = self.__copy_product_to_s3(
                        productZipFile, product["title"], awsAccessKeyId,
                        awsSecretKey)
                    self.logger.info(
                        "Coppied product %s to S3 bucket, removing temp file",
                        product["title"])
                except Exception as e:
                    self.logger.warning(
                        "Failed to copy product %s to S3 with error %s",
                        product["title"], e)
                    continue

                if not self.debug:
                    os.remove(productZipFile)

                # add metadata to catalog
                if not self.debug:
                    cat.addProduct(product)
                else:
                    self.logger.info("DEBUG: Add product to catalog %s",
                                     product["title"])

                downloadedProductCount = downloadedProductCount + 1

        return downloadedProductCount