def __init__(self, api_key, config):
        imagery_config = config['imagery']
        self.api_key = api_key
        self.max_clouds_initial = float(imagery_config['max_clouds_initial'])
        self.max_clouds = float(
            imagery_config['max_clouds']
        )  # max proportion of pixels that are clouds
        self.max_bad_pixels = float(
            imagery_config['max_bad_pixels']
        )  # max proportion of bad pixels (transmission errors, etc.)
        self.max_nodata = float(
            imagery_config['max_nodata'])  # max nodata values per cellgrid
        self.maximgs = int(imagery_config['maximgs'])  # 15 #10 #20
        self.output_encoding = imagery_config['output_encoding']
        # self.output_filename = imagery_config['output_filename']
        # self.output_filename_csv = imagery_config['output_filename_csv']
        self.catalog_path = imagery_config['catalog_path']
        # self.s3_catalog_bucket = imagery_config['s3_catalog_bucket']
        # self.s3_catalog_prefix = imagery_config['s3_catalog_prefix']
        self.products = {
            'analytic_sr': {
                'item_type': 'PSScene4Band',
                'asset_type': 'analytic_sr',
                'ext': 'tif'
            },
            'analytic': {
                'item_type': 'PSScene4Band',
                'asset_type': 'analytic',
                'ext': 'tif'
            },
            'analytic_xml': {
                'item_type': 'PSScene4Band',
                'asset_type': 'analytic_xml',
                'ext': 'xml'
            },
            'visual': {
                'item_type': 'PSScene3Band',
                'asset_type': 'visual',
                'ext': 'tif'
            }
        }
        self.client = api.ClientV1(api_key=self.api_key)
        self.s3client = boto3.client('s3')
        # self.with_analytic = json.loads(imagery_config['with_analytic'].lower())
        # self.with_analytic_xml = json.loads(imagery_config['with_analytic_xml'].lower())
        # self.with_visual = json.loads(imagery_config['with_visual'].lower())
        # self.with_immediate_cleanup = json.loads(imagery_config['with_immediate_cleanup'].lower())
        # self.local_mode = json.loads(imagery_config['local_mode'].lower())
        # self.s3_only = json.loads(imagery_config['s3_only'].lower())
        # self.transfer = S3Transfer(self.s3client, TransferConfig(use_threads = False))
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)
        # Planet has a limit of 5 sec per key (search queries)
        threads_number = imagery_config['threads']
        if threads_number == 'default':
            threads_number = multiprocessing.cpu_count() * 2 + 1
        else:
            threads_number = int(threads_number)

        self.secondary_uploads_executor = FixedThreadPoolExecutor(
            size=threads_number)
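# A minimal sketch of the configuration this constructor expects: a
# ConfigParser-style mapping with an [imagery] section (an assumption; the key
# names are taken from the lookups above, the values are illustrative placeholders).
import configparser

_example_config = configparser.ConfigParser()
_example_config.read_string("""
[imagery]
max_clouds_initial = 0.25
max_clouds = 0.01
max_bad_pixels = 0.25
max_nodata = 0.25
maximgs = 15
output_encoding = utf-8
catalog_path = catalog/
threads = default
""")
# pclient = PClientV1(api_key='<PLANET_API_KEY>', config=_example_config)  # class shown in Example #7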
Example #2
def __init__(self, api_key):
    self.api_key = api_key
    self.max_clouds_initial = 0.25
    self.max_clouds = 0.01
    self.max_shadows = 0.01
    self.max_bad_pixels = 0.25
    self.max_nodata = 0.25
    self.maximgs = 1
    self.catalog_path = "catalog/"
    self.s3_catalog_bucket = "azavea-africa-test"
    self.s3_catalog_prefix = "planet/images"
    self.products = {
        'analytic_sr': {
            'item_type': 'PSScene4Band',
            'asset_type': 'analytic_sr',
            'ext': 'tif'
        },
        'analytic': {
            'item_type': 'PSScene4Band',
            'asset_type': 'analytic',
            'ext': 'tif'
        },
        'analytic_xml': {
            'item_type': 'PSScene4Band',
            'asset_type': 'analytic_xml',
            'ext': 'xml'
        },
        'visual': {
            'item_type': 'PSScene3Band',
            'asset_type': 'visual',
            'ext': 'tif'
        }
    }
    self.client = api.ClientV1(api_key=api_key)
    self.output_filename = "output.csv"
    self.output_encoding = "utf-8"
    self.s3client = boto3.client('s3')
    self.with_analytic = False
    self.with_analytic_xml = False
    self.with_visual = False
    self.local_mode = False
    self.s3_only = False
    self.transfer = S3Transfer(self.s3client,
                               TransferConfig(use_threads=False))
    self.transfer_config = TransferConfig(use_threads=False)
    self.logger = logging.getLogger(__name__)
    self.logger.setLevel(logging.INFO)
    self.secondary_uploads_executor = FixedThreadPoolExecutor(size=5)
    self.with_immediate_cleanup = False
Example #3
    def __init__(self, config):
        db_config = config['database']
        imagery_config = config['imagery']
        self.host = db_config['host']
        self.dbname = db_config['dbname']
        self.user = db_config['user']
        self.password = db_config['password']
        self.master_grid_table = db_config['master_grid_table']
        self.scene_data_table = db_config['scene_data_table']
        self.enabled = eval(db_config['enabled'])
        self.conn = None
        self.skip_existing = eval(imagery_config['skip_existing'])
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)
        # Planet has a limit of 5 sec per key (search queries)
        threads_number = imagery_config['threads']
        if threads_number == 'default':
            threads_number = multiprocessing.cpu_count() * 2 + 1
        else:
            threads_number = int(threads_number)

        self.query_executor = FixedThreadPoolExecutor(size=threads_number)
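# A minimal sketch of the config this constructor reads: a ConfigParser-style
# mapping with [database] and [imagery] sections (an assumption; the key names
# come from the lookups above, the values are placeholders).
import configparser

_example_config = configparser.ConfigParser()
_example_config.read_string("""
[database]
host = localhost
dbname = imagery
user = postgres
password = secret
master_grid_table = master_grid
scene_data_table = scene_data
enabled = True

[imagery]
skip_existing = True
threads = default
""")
# psqlclient = PSQLPClient(_example_config)  # class shown in full in Example #8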
Example #4
def download_m2m(directory, username=None, products='STANDARD', dataset='ARD_TILE',
                    N=50000, temporal=None, batch=1000, threads=1, maxcloudcover=80, fields=None):
    """
    Search for and download Landsat Level-2 products to local directory
        Args:
            directory: Relative path to local directory (will be created)
            username: ERS Username (with full M2M download access) [Optional]
            dataset: EarthExplorer Catalog datasetName [Default: ARD_TILE]
            N: Maximum number of search results to return
            products: Comma-delimited list of download products [Default: STANDARD]
            temporal: Search date(s) the image was acquired [ Format: %Y-%m-%d or %Y-%m-%d,%Y-%m-%d ]
            batch: How many URLs to request before working on downloads
            threads: Number of download threads to launch in parallel
            maxcloudcover: Maximum cloud cover percentage for search results [Default: 80]
            fields: JSON dataset-specific metadata fields (see #additionalCriteria)
    """
    # username = '******'
    # dataset = 'ARD_TILE'
    # N = 50000
    api_key = EarthExplorer.login(**credentials(username))
    # temporal = "2000-01-01,2020-12-31"
    # fields = {"Grid Region": "CU", "Horizontal": 11, "Vertical": 9}
    # directory = u'/Users/coloury/Landsat_test'
    # threads = 4
    # products = 'SR'

    log_path = '%s/download.log' % (directory)
    if os.path.exists(log_path):
        os.remove(log_path)

    logging.basicConfig(filename=log_path, filemode='w',
                        format='%(asctime)s %(levelname)-8s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info('Downloading starts')

    datasets = EarthExplorer.datasets(apiKey=api_key, datasetName=dataset, publicOnly=False)
    matches = [m['datasetName'] for m in datasets]
    if len(matches) > 1 and not any([m == dataset for m in matches]):
        message(['Multiple dataset matches found, please select only 1: ']
                + ['* [%s]: %s' % (m['datasetName'], m['datasetFullName']) for m in datasets], stop=True)

    search = dict(apiKey=api_key, datasetName=dataset, maxResults=N)
    if fields:
        search.update(EarthExplorer.additionalCriteriaValues(api_key, dataset, fields))
    if temporal:
        search.update(EarthExplorer.temporalCriteria(temporal=temporal))

    search['maxCloudCover'] = maxcloudcover
    results = EarthExplorer.search(**search)
    n_results = results['totalHits']
    product_ids = results['results']

    message('Total search results: %d \n' % n_results)
    logger.info('Total search results: %d' % n_results)
    if len(product_ids) < 1:
        logger.error('No valid products returned')
        return

    if not os.path.exists(directory):
        os.makedirs(directory)

    # current users are only allowed to send one request at a time
    download_pool = FixedThreadPoolExecutor(threads)
    for pids in product_ids:
        entities = pids['entityId']
        download_pool.submit(download_executor, api_key, dataset, entities, products, directory, logger)
    download_pool.drain()
    download_pool.close()
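# A usage sketch for download_m2m (illustrative only: it assumes valid ERS/M2M
# credentials are available to EarthExplorer.login(); the values mirror the
# commented examples above and are placeholders):
# download_m2m(directory='/tmp/ard_tiles',                # hypothetical path
#              username='my_ers_user',                    # placeholder
#              products='SR',
#              dataset='ARD_TILE',
#              temporal='2000-01-01,2020-12-31',
#              fields={"Grid Region": "CU", "Horizontal": 11, "Vertical": 9},
#              threads=4,
#              maxcloudcover=80)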
Example #5
def main(source_dir, out_dir, threads_number, parallel_mode, clear_threshold):
    # source_dir = '/Users/coloury/Dropbox/transfer_landsat'
    # out_dir = '/Users/coloury/sccd_test'
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    if not os.path.exists(source_dir):
        print('Source directory does not exist!')
        return

    if parallel_mode == 'desktop':
        tz = timezone('US/Eastern')
        logging.basicConfig(filename=os.path.join(
            os.getcwd(), 'AutoPrepareDataARD_{}.log'.format(
                datetime.fromtimestamp(time.time()).strftime('%c').replace(
                    " ", "_").replace(":", "-"))),
                            filemode='w+',
                            level=logging.INFO)

        logger = logging.getLogger(__name__)

        tmp_path = os.path.join(out_dir, 'tmp')

        if not os.path.exists(tmp_path):
            os.mkdir(tmp_path)

        if threads_number == 0:
            threads_number = multiprocessing.cpu_count()
        else:
            threads_number = int(threads_number)

        print('Number of threads to run in parallel: {}'.format(threads_number))

        folder_list = [
            f[0:len(f) - 4] for f in listdir(source_dir)
            if (isfile(join(source_dir, f)) and f.endswith('.tar')
                and f[len(f) - 6:len(f) - 4] == 'SR')
        ]
        width = 5000
        height = 5000
        band_count = 8

        prepare_executor = FixedThreadPoolExecutor(size=threads_number)

        for count, folder in enumerate(folder_list):
            print("it is processing {} th scene in total {} scene ".format(
                count + 1, len(folder_list)))
            prepare_executor.submit(single_image_processing, tmp_path,
                                    source_dir, out_dir, folder,
                                    clear_threshold, width, height, band_count,
                                    count + 1, len(folder_list))

        # wait until all tiles are finished
        prepare_executor.drain()

        # wait for the thread pool to stop
        prepare_executor.close()

        logger.info("Final report: finished preparation task ({})".format(
            datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')))

        # count_valid = len(scene_list)
        # logger.warning("Total processing scene number is {}; valid scene number is {}".format(count, count_valid))

        # remove tmp folder
        shutil.rmtree(tmp_path, ignore_errors=True)

    else:  # for HPC mode
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        # query available cores/process number
        n_process = comm.Get_size()
        print('Number of cores to run in parallel: {}'.format(n_process))

        if rank == 0:
            tz = timezone('US/Eastern')
            # logger = logging.getLogger(__name__)
            # logger.info('AutoPrepareDataARD starts: {}'.format(datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')))
            print('AutoPrepareDataARD starts: {}'.format(
                datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')))

            tmp_path = os.path.join(out_dir, 'tmp')

            # select only _SR
            folder_list = [
                f[0:len(f) - 4] for f in listdir(source_dir)
                if (isfile(join(source_dir, f)) and f.endswith('.tar')
                    and f[len(f) - 6:len(f) - 4] == 'SR')
            ]
            width = 5000
            height = 5000
            band_count = 8

            scene_per_process = ceil(len(folder_list) / n_process)
            # number of ranks that receive one scene fewer than scene_per_process
            scene_extra = scene_per_process * n_process - len(folder_list)

            # logger.info('The total process number is : {}'.format(n_process))
            # logger.info('scene number per process is : {}'.format(scene_per_process))
            # logger.info('extra scene number is : {}'.format(scene_extra))
            print('Total number of processes: {}'.format(n_process))
            print('Scenes per process: {}'.format(scene_per_process))
            print('Extra scene count: {}'.format(scene_extra))
            # create the tmp path if it does not exist
            if not os.path.exists(tmp_path):
                os.mkdir(tmp_path)
        else:
            # logger = None
            tmp_path = None
            folder_list = None
            width = None
            height = None
            band_count = None
            scene_per_process = None
            scene_extra = None

        # MPI broadcasting variables
        # comm.bcast(logger, root=0)
        tmp_path = comm.bcast(tmp_path, root=0)
        folder_list = comm.bcast(folder_list, root=0)
        width = int(comm.bcast(width, root=0))
        height = int(comm.bcast(height, root=0))
        band_count = int(comm.bcast(band_count, root=0))
        scene_per_process = int(comm.bcast(scene_per_process, root=0))
        scene_extra = int(comm.bcast(scene_extra, root=0))

        # ranks smaller than scene_extra are assigned scene_per_process - 1 scenes each
        if rank < scene_extra:
            for i in range(
                (scene_per_process - 1) * rank,
                (scene_per_process - 1) * rank + scene_per_process - 1):
                folder = folder_list[i]
                single_image_processing(tmp_path, source_dir, out_dir, folder,
                                        clear_threshold, width, height,
                                        band_count, i + 1, len(folder_list))
        else:  # remaining ranks get scene_per_process scenes each
            for i in range(
                (scene_per_process - 1) * scene_extra +
                (rank - scene_extra) * scene_per_process,
                (scene_per_process - 1) * scene_extra +
                (rank - scene_extra) * scene_per_process + scene_per_process):
                folder = folder_list[i]
                single_image_processing(tmp_path, source_dir, out_dir, folder,
                                        clear_threshold, width, height,
                                        band_count, i + 1, len(folder_list))
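# Worked example of the HPC partitioning above (illustrative numbers): with
# len(folder_list) = 10 scenes and n_process = 4 cores,
#   scene_per_process = ceil(10 / 4) = 3
#   scene_extra       = 3 * 4 - 10   = 2
# so ranks 0-1 (rank < scene_extra) process 2 scenes each (indices 0-1 and 2-3)
# and ranks 2-3 process 3 scenes each (indices 4-6 and 7-9), covering all 10.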
Example #6
logging.basicConfig(format='%(message)s', datefmt='%m-%d %H:%M')

api_key = planet_config['api_key']
cellgrid_buffer = 1 + float(imagery_config['cellgrid_buffer'])
master_grid_path = imagery_config['master_grid_path']  # EPSG:4326
GS = "GS"  # growing, wet season
OS = "OS"  # off, dry season

# Planet has a limit of 5 sec per key (search queries)
threads_number = imagery_config['threads']
if threads_number == 'default':
    threads_number = multiprocessing.cpu_count() * 2 + 1
else:
    threads_number = int(threads_number)

neighbours_executor = FixedThreadPoolExecutor(size=threads_number)

# pclient init
pclient = PClientV1(api_key, config)

# rfclient init
rfclient = RFClient(config)

# psql client init
psqlclient = PSQLPClient(config)
psqlclient.connect()

# aoi
features = geojson.load(open(imagery_config['aoi']))['features']
actual_aoi = shape(MultiPolygon([Polygon(f['geometry']) for f in features]))
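# A minimal sketch of the AOI file loaded above, assuming a GeoJSON
# FeatureCollection of Polygon features (coordinates are placeholders):
# {
#   "type": "FeatureCollection",
#   "features": [
#     { "type": "Feature",
#       "properties": {},
#       "geometry": { "type": "Polygon",
#                     "coordinates": [[[30.0, -2.0], [30.1, -2.0], [30.1, -1.9],
#                                      [30.0, -1.9], [30.0, -2.0]]] } }
#   ]
# }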
Example #7
class PClientV1():
    def __init__(self, api_key):
        self.api_key = api_key
        self.max_clouds_initial = 0.25
        self.max_clouds = 0.01
        self.max_shadows = 0.01
        self.max_bad_pixels = 0.25
        self.max_nodata = 0.25
        self.maximgs = 1
        self.catalog_path = "catalog/"
        self.s3_catalog_bucket = "azavea-africa-test"
        self.s3_catalog_prefix = "planet/images"
        self.products = {
            'analytic_sr': {
                'item_type': 'PSScene4Band',
                'asset_type': 'analytic_sr',
                'ext': 'tif'
            },
            'analytic': {
                'item_type': 'PSScene4Band',
                'asset_type': 'analytic',
                'ext': 'tif'
            },
            'analytic_xml': {
                'item_type': 'PSScene4Band',
                'asset_type': 'analytic_xml',
                'ext': 'xml'
            },
            'visual': {
                'item_type': 'PSScene3Band',
                'asset_type': 'visual',
                'ext': 'tif'
            }
        }
        self.client = api.ClientV1(api_key=api_key)
        self.output_filename = "output.csv"
        self.output_encoding = "utf-8"
        self.s3client = boto3.client('s3')
        self.with_analytic = False
        self.with_analytic_xml = False
        self.with_visual = False
        self.local_mode = False
        self.s3_only = False
        self.transfer = S3Transfer(self.s3client,
                                   TransferConfig(use_threads=False))
        self.transfer_config = TransferConfig(use_threads=False)
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)
        self.secondary_uploads_executor = FixedThreadPoolExecutor(size=5)
        self.with_immediate_cleanup = False

    def __init__(self, api_key, config):
        imagery_config = config['imagery']
        self.api_key = api_key
        self.max_clouds_initial = float(
            imagery_config['max_clouds_initial']
        )  # max initial proportion of pixels that are clouds
        self.max_clouds = float(
            imagery_config['max_clouds']
        )  # max proportion of clouds detected by filter
        self.max_shadows = float(
            imagery_config['max_shadows']
        )  # max proportion of cloud shadows detected by filter
        self.max_bad_pixels = float(
            imagery_config['max_bad_pixels']
        )  # max proportion of bad pixels (transmission errors, etc.)
        self.max_nodata = float(
            imagery_config['max_nodata'])  # max nodata values per cellgrid
        self.maximgs = int(imagery_config['maximgs'])  # 15 #10 #20
        self.output_encoding = imagery_config['output_encoding']
        self.output_filename = imagery_config['output_filename']
        self.output_filename_csv = imagery_config['output_filename_csv']
        self.catalog_path = imagery_config['catalog_path']
        self.s3_catalog_bucket = imagery_config['s3_catalog_bucket']
        self.s3_catalog_prefix = imagery_config['s3_catalog_prefix']
        self.products = {
            'analytic_sr': {
                'item_type': 'PSScene4Band',
                'asset_type': 'analytic_sr',
                'ext': 'tif'
            },
            'analytic': {
                'item_type': 'PSScene4Band',
                'asset_type': 'analytic',
                'ext': 'tif'
            },
            'analytic_xml': {
                'item_type': 'PSScene4Band',
                'asset_type': 'analytic_xml',
                'ext': 'xml'
            },
            'visual': {
                'item_type': 'PSScene3Band',
                'asset_type': 'visual',
                'ext': 'tif'
            }
        }
        self.client = api.ClientV1(api_key=self.api_key)
        self.s3client = boto3.client('s3')
        self.with_analytic = json.loads(
            imagery_config['with_analytic'].lower())
        self.with_analytic_xml = json.loads(
            imagery_config['with_analytic_xml'].lower())
        self.with_visual = json.loads(imagery_config['with_visual'].lower())
        self.with_immediate_cleanup = json.loads(
            imagery_config['with_immediate_cleanup'].lower())
        self.local_mode = json.loads(imagery_config['local_mode'].lower())
        self.s3_only = json.loads(imagery_config['s3_only'].lower())
        self.transfer = S3Transfer(self.s3client,
                                   TransferConfig(use_threads=False))
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)
        # Planet has a limit of 5 sec per key (search queries)
        threads_number = imagery_config['threads']
        if threads_number == 'default':
            threads_number = multiprocessing.cpu_count() * 2 + 1
        else:
            threads_number = int(threads_number)

        self.secondary_uploads_executor = FixedThreadPoolExecutor(
            size=threads_number)

    # start_date and end_date are passed in explicitly, since they come from a row retrieved from psql / a tiff file
    def set_filters_sr(self,
                       aoi,
                       start_date='2017-12-15T00:00:00.000Z',
                       end_date='2018-03-15T00:00:00.000Z',
                       id=''):
        # add an asset_filter for only those scenes that have an analytic_sr asset available
        date_filter = {
            'type': 'DateRangeFilter',
            'field_name': 'acquired',
            'config': {
                'gte': start_date,
                'lte': end_date
            }
        }

        cloud_filter = {
            'type': 'RangeFilter',
            'field_name': 'cloud_cover',
            'config': {
                'lte': self.max_clouds_initial  # only for UDM-detected clouds
            }
        }

        bad_pixel_filter = {
            'type': 'RangeFilter',
            'field_name': 'anomalous_pixels',
            'config': {
                'lte': self.max_bad_pixels
            }
        }

        location_filter = api.filters.geom_filter(aoi)

        geometry_filter = {
            "type": "GeometryFilter",
            "field_name": "geometry",
            "config": aoi
        }

        asset_filter = {
            "type": "PermissionFilter",
            "config": ["assets.analytic_sr:download"]
        }

        string_filter = {
            "type": "StringInFilter",
            "field_name": "id",
            "config": [id]
        }

        filters_list = [
            date_filter, cloud_filter, geometry_filter, bad_pixel_filter,
            asset_filter
        ]
        if (id != ''):
            filters_list.append(string_filter)

        # combine filters:
        query = {'type': 'AndFilter', 'config': filters_list}

        return query

    # builds a filter that matches a single scene id, gated on analytic_sr download permission
    def set_filters_id(self, id=''):
        asset_filter = {
            "type": "PermissionFilter",
            "config": ["assets.analytic_sr:download"]
        }

        string_filter = {
            "type": "StringInFilter",
            "field_name": "id",
            "config": [id]
        }

        filters_list = [asset_filter, string_filter]

        # combine filters:
        query = {'type': 'AndFilter', 'config': filters_list}

        return query

    @retry(tries=10, delay=2, backoff=2)
    def request_intersecting_scenes(self, query):
        # build the request
        item_types = ['PSScene4Band']  # params["lst_item_types"]
        request = api.filters.build_search_request(query, item_types)

        # post the request
        results = self.client.quick_search(request)
        return results
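    # Usage sketch (illustrative; assumes the planet 1.x client, whose search
    # results expose items_iter(), and that `aoi` is a GeoJSON geometry dict):
    #   query = pclient.set_filters_sr(aoi,
    #                                  start_date='2017-12-15T00:00:00.000Z',
    #                                  end_date='2018-03-15T00:00:00.000Z')
    #   items = pclient.request_intersecting_scenes(query)
    #   for item in items.items_iter(limit=pclient.maximgs):
    #       print(item['id'])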

    # returns a full URI here
    def download_localfs_generic(self,
                                 scene_id,
                                 season='',
                                 asset_type='analytic_sr',
                                 ext='tif',
                                 item_type='PSScene4Band'):
        output_file = "{}{}/{}/{}.{}".format(self.catalog_path, asset_type,
                                             season, scene_id, ext)

        if not os.path.exists(output_file):
            # activation & download
            session = requests.Session()
            session.auth = (self.api_key, '')
            assets_uri = (
                "https://api.planet.com/data/v1/item-types/{}/items/{}/assets/"
            ).format(item_type, scene_id)

            assets_query_result = session.get(assets_uri)

            self.logger.info(assets_query_result.status_code)
            item_activation_json = assets_query_result.json()
            # self.logger.info(item_activation_json)
            item_activation_url = item_activation_json[asset_type]["_links"][
                "activate"]
            response = session.post(item_activation_url)
            self.logger.info(response.status_code)
            # poll until the activation endpoint responds with 204 (asset active)
            while response.status_code != 204:
                time.sleep(30)
                response = session.post(item_activation_url)
                self.logger.info(response.status_code)

            item_url = 'https://api.planet.com/data/v1/item-types/{}/items/{}/assets/'.format(
                item_type, scene_id)
            result = requests.get(item_url,
                                  auth=HTTPBasicAuth(self.api_key, ''))

            if result.status_code != 200:
                self.logger.info(result.content.decode('utf-8'))

            download_url = result.json()[asset_type]['location']

            # download
            with urllib.request.urlopen(download_url) as response, open(
                    output_file, 'wb') as out_file:
                shutil.copyfileobj(response, out_file)

        return output_file

    # TODO: lots of copy-pasting happens here, abstract over it?
    # returns a full S3 URI here
    def download_s3_generic(self,
                            scene_id,
                            season='',
                            asset_type='analytic_sr',
                            ext='tif',
                            item_type='PSScene4Band'):
        output_key = "{}/{}/{}/{}.{}".format(self.s3_catalog_prefix,
                                             asset_type, season, scene_id, ext)
        result_path = 's3://{}/{}'.format(self.s3_catalog_bucket, output_key)

        try:
            self.s3client.head_object(Bucket=self.s3_catalog_bucket,
                                      Key=output_key)
        except botocore.exceptions.ClientError:
            self.logger.exception('Error Encountered')
            self.logger.info("Downloading {}...".format(scene_id))

            # activation & download
            session = requests.Session()
            session.auth = (self.api_key, '')
            assets_uri = (
                "https://api.planet.com/data/v1/item-types/{}/items/{}/assets/"
            ).format(item_type, scene_id)

            assets_query_result = session.get(assets_uri)

            self.logger.info(assets_query_result.status_code)
            item_activation_json = assets_query_result.json()
            # self.logger.info(item_activation_json)
            item_activation_url = item_activation_json[asset_type]["_links"][
                "activate"]
            response = session.post(item_activation_url)
            self.logger.info(response.status_code)
            # poll until the activation endpoint responds with 204 (asset active)
            while response.status_code != 204:
                time.sleep(30)
                response = session.post(item_activation_url)
                self.logger.info(response.status_code)

            item_url = 'https://api.planet.com/data/v1/item-types/{}/items/{}/assets/'.format(
                item_type, scene_id)
            result = requests.get(item_url,
                                  auth=HTTPBasicAuth(self.api_key, ''))

            if result.status_code != 200:
                self.logger.info(result.content.decode('utf-8'))

            download_url = result.json()[asset_type]['location']

            # upload on s3 directly from the response
            with urllib.request.urlopen(download_url) as response:
                self.s3client.put_object(Body=response.read(),
                                         Bucket=self.s3_catalog_bucket,
                                         Key=output_key)

            # finished
            self.logger.info("Downloaded {}".format(scene_id))

        return result_path

    # returns a full URI here
    def download_localfs_product(self, product_type, scene_id, season=''):
        cfg = self.products[product_type]
        return self.download_localfs_generic(scene_id=scene_id,
                                             season=season,
                                             asset_type=cfg['asset_type'],
                                             ext=cfg['ext'],
                                             item_type=cfg['item_type'])

    # returns a full URI here
    def download_s3_product(self, product_type, scene_id, season=''):
        cfg = self.products[product_type]
        return self.download_s3_generic(scene_id=scene_id,
                                        season=season,
                                        asset_type=cfg['asset_type'],
                                        ext=cfg['ext'],
                                        item_type=cfg['item_type'])

    def download_localfs_analytic_sr(self, scene_id, season=''):
        return self.download_localfs_product('analytic_sr', scene_id, season)

    def download_s3_analytic_sr(self, scene_id, season=''):
        return self.download_s3_product('analytic_sr', scene_id, season)

    def download_localfs_analytic(self, scene_id, season=''):
        return self.download_localfs_product('analytic', scene_id, season)

    def download_s3_analytic(self, scene_id, season=''):
        return self.download_s3_product('analytic', scene_id, season)

    def download_localfs_analytic_xml(self, scene_id, season=''):
        return self.download_localfs_product('analytic_xml', scene_id, season)

    def download_s3_analytic_xml(self, scene_id, season=''):
        return self.download_s3_product('analytic_xml', scene_id, season)

    def download_localfs_visual(self, scene_id, season=''):
        return self.download_localfs_product('visual', scene_id, season)

    def download_s3_visual(self, scene_id, season=''):
        return self.download_s3_product('visual', scene_id, season)
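    # Usage sketch for the convenience wrappers above (the scene id is a placeholder):
    #   local_tif = pclient.download_localfs_analytic_sr('20180116_074996_1054_3B', season='GS')
    #   s3_uri = pclient.download_s3_analytic_sr('20180116_074996_1054_3B', season='GS')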

    def upload_s3_csv(self):
        result = ''
        if not self.local_mode:
            output_key = "{}/{}".format(self.s3_catalog_prefix,
                                        self.output_filename.split('/')[-1])
            result = 's3://{}/{}'.format(self.s3_catalog_bucket, output_key)
            self.transfer.upload_file(self.output_filename,
                                      self.s3_catalog_bucket, output_key)
        else:
            result = self.output_filename

        return result

    def upload_s3_csv_csv(self):
        output_key = "{}/{}".format(self.s3_catalog_prefix,
                                    self.output_filename_csv.split('/')[-1])
        result = 's3://{}/{}'.format(self.s3_catalog_bucket, output_key)
        self.transfer.upload_file(self.output_filename_csv,
                                  self.s3_catalog_bucket, output_key)
        return result

    def download_localfs_s3_product(self,
                                    scene_id,
                                    season='',
                                    product_type='analytic_sr'):
        cfg = self.products[product_type]
        asset_type = cfg['asset_type']
        ext = cfg['ext']
        item_type = cfg['item_type']

        filepath = ''
        output_key = "{}/{}/{}/{}.{}".format(self.s3_catalog_prefix,
                                             asset_type, season, scene_id, ext)
        s3_result = 's3://{}/{}'.format(self.s3_catalog_bucket, output_key)
        local_result = "{}{}/{}/{}.{}".format(self.catalog_path, asset_type,
                                              season, scene_id, ext)

        if not self.s3_only:
            if not os.path.exists(local_result):
                if not self.local_mode:
                    try:
                        # if the file is already in our S3 bucket, use the S3 location (faster)
                        self.s3client.head_object(
                            Bucket=self.s3_catalog_bucket, Key=output_key)
                        filepath = s3_result
                        # self.logger.info("Downloading {} from the internal S3 storage...".format(scene_id))
                        # self.transfer.download_file(self.s3_catalog_bucket, output_key, local_result)
                        # filepath = local_result # filepath = s3_result
                    except botocore.exceptions.ClientError:
                        self.logger.exception('Error Encountered')
                        filepath = self.download_localfs_product(
                            product_type, scene_id, season)
                        self.logger.info("Uploading {}...".format(scene_id))
                        self.s3client.put_object(Bucket=self.s3_catalog_bucket,
                                                 Key=output_key,
                                                 Body=open(filepath, 'rb'))
                        # self.transfer.upload_file(filepath, self.s3_catalog_bucket, output_key)
                else:
                    filepath = self.download_localfs_product(
                        product_type, scene_id, season)
                    s3_result = local_result
            else:
                filepath = local_result
                if self.local_mode:
                    s3_result = local_result
                else:
                    try:
                        self.s3client.head_object(
                            Bucket=self.s3_catalog_bucket, Key=output_key)
                    except botocore.exceptions.ClientError:
                        self.logger.exception('Error Encountered')
                        self.logger.info("Uploading {}...".format(scene_id))
                        self.s3client.put_object(Bucket=self.s3_catalog_bucket,
                                                 Key=output_key,
                                                 Body=open(filepath, 'rb'))
                        # self.transfer.upload_file(filepath, self.s3_catalog_bucket, output_key)
        else:
            s3_result = self.download_s3_product(product_type, scene_id,
                                                 season)
            filepath = s3_result

        return filepath, s3_result

    def download_localfs_s3(self, scene_id, season=''):
        sub_products = []
        # product_type keys must match self.products so download_localfs_s3_product can resolve them
        if self.with_analytic:
            sub_products.append('analytic')

        if self.with_analytic_xml:
            sub_products.append('analytic_xml')

        if self.with_visual:
            sub_products.append('visual')

        for sub_product in sub_products:
            self.secondary_uploads_executor.submit(
                self.download_localfs_s3_product, scene_id, season,
                sub_product)

        return self.download_localfs_s3_product(scene_id, season)

    def drain(self):
        self.secondary_uploads_executor.drain()
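    # Usage sketch for the combined local/S3 download path (the scene id is a placeholder):
    #   filepath, s3_uri = pclient.download_localfs_s3('20180116_074996_1054_3B', season='OS')
    #   pclient.drain()   # wait for any secondary product downloads queued above
    #   pclient.close()   # stop the thread pool and run catalog cleanup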

    def cleanup_catalog(self):
        self.logger.info("Catalog cleanup...")
        if self.with_immediate_cleanup & (not self.s3_only):
            for product_type in [
                    'analytic', 'analytic_sr', 'analytic_xml', 'visual'
            ]:
                for season in ['OS', 'GS']:
                    lpath = "{}{}/{}".format(self.catalog_path, product_type,
                                             season)
                    try:
                        shutil.rmtree(lpath, ignore_errors=False)
                        os.makedirs(lpath)
                    except:
                        self.logger.exception('Error Encountered')
                        self.logger.info(
                            "Could not remove a folder: {}".format(lpath))

    def close(self):
        self.secondary_uploads_executor.close()
        self.cleanup_catalog()
Example #8
class PSQLPClient():
    def __init__(self, config):
        db_config = config['database']
        imagery_config = config['imagery']
        self.host = db_config['host']
        self.dbname = db_config['dbname']
        self.user = db_config['user']
        self.password = db_config['password']
        self.master_grid_table = db_config['master_grid_table']
        self.scene_data_table = db_config['scene_data_table']
        self.enabled = eval(db_config['enabled'])
        self.conn = None
        self.skip_existing = eval(imagery_config['skip_existing'])
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)
        # Planet has a limit of 5 sec per key (search queries)
        threads_number = imagery_config['threads']
        if threads_number == 'default':
            threads_number = multiprocessing.cpu_count() * 2 + 1
        else:
            threads_number = int(threads_number)

        self.query_executor = FixedThreadPoolExecutor(size=threads_number)

    def connect(self):
        if self.enabled:
            self.conn = psycopg2.connect(
                'host={} dbname={} user={} password={}'.format(
                    self.host, self.dbname, self.user, self.password))

    def connection_close(self):
        if self.enabled:
            self.conn.close()

    def get_cursor(self):
        return self.conn.cursor()

    def query_by_extent(self, extent, limit=10):
        curs = self.get_cursor()
        query = ""

        if limit is None:
            query = """SELECT * FROM %s
            WHERE x >= %s AND x <= %s AND y >= %s AND y <= %s
            ORDER BY gid""" % (self.master_grid_table, extent['xmin'],
                               extent['xmax'], extent['ymin'], extent['ymax'])
        else:
            query = """SELECT * FROM %s
                WHERE x >= %s AND x <= %s AND y >= %s AND y <= %s
                ORDER BY gid LIMIT %s""" % (
                self.master_grid_table, extent['xmin'], extent['xmax'],
                extent['ymin'], extent['ymax'], limit)

        curs.execute(query)

        return curs
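    # Usage sketch (extent bounds are placeholders; rows follow the master grid
    # table layout):
    #   curs = psqlclient.query_by_extent({'xmin': 30.0, 'xmax': 30.5,
    #                                      'ymin': -2.5, 'ymax': -2.0}, limit=10)
    #   rows = curs.fetchall()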

    def insert_row_with_commit(self, row):
        if self.enabled:
            try:
                curs = self.conn.cursor()
                self.insert_row(row, curs)
                self.conn.commit()
            except Exception:
                self.conn.rollback()

    def insert_row(self, row, curs):
        if self.enabled:
            # [provider] | scene_id | [cell_id | season] | global_col | global_row | url | tms_url | date_time
            curs.execute(
                """INSERT INTO {} VALUES (%s, %s, %s, %s, %s, %s, %s, %s, now())"""
                .format(self.scene_data_table),
                (row['provider'], row['scene_id'], row['cell_id'],
                 row['season'], row.get('global_col'), row.get('global_row'),
                 row.get('url'), row.get('tms_url')))

    def exists_row(self, cell_id, season, provider='planet'):
        if self.enabled and self.skip_existing:
            try:
                curs = self.conn.cursor()
                curs.execute(
                    """SELECT FROM %s WHERE provider = '%s' AND cell_id = %s AND season = '%s' and tms_url <> ''"""
                    % (self.scene_data_table, provider, cell_id, season))
                return curs.fetchone() is not None
            except:
                self.logger.exception('Error Encountered')
                return False
        else:
            return False

    def insert_rows_by_one(self, rows):
        if self.enabled:
            # [provider] | scene_id | [cell_id | season] | global_col | global_row | url | tms_url | date_time
            curs = self.conn.cursor()
            for row in rows:
                try:
                    curs.execute(
                        """INSERT INTO {} VALUES (%s, %s, %s, %s, %s, %s, %s, %s, now())"""
                        .format(self.scene_data_table),
                        (row[0], row[1], row[2], row[3], row[4], row[5],
                         row[6], row[7]))
                except psycopg2.IntegrityError:
                    self.conn.rollback()
                    try:
                        curs.execute(
                            """
                                UPDATE {} SET scene_id = %s, global_col = %s, global_row = %s, url = %s, tms_url = %s, date_time = now()
                                WHERE provider = %s AND cell_id = %s AND season = %s
                                """.format(self.scene_data_table),
                            (row[1], row[4], row[5], row[6], row[7], row[0],
                             row[2], row[3]))
                    except psycopg2.IntegrityError:
                        self.conn.rollback()
                    else:
                        self.conn.commit()
                else:
                    self.conn.commit()

    def insert_rows_by_one_async(self, rows):
        if self.enabled:
            self.query_executor.submit(self.insert_rows_by_one, rows)

    def insert_rows(self, rows):
        if self.enabled:
            try:
                curs = self.conn.cursor()
                # [provider] | scene_id | [cell_id | season] | global_col | global_row | url | tms_url | date_time
                args_str = ','.join(
                    curs.mogrify("%s", (row, )).decode('utf8') for row in rows)
                curs.execute("INSERT INTO {} VALUES {}".format(
                    self.scene_data_table,
                    args_str))  #  ON CONFLICT DO NOTHING PSQL 9.5 only
                self.conn.commit()
            except:
                self.conn.rollback()

    def query_without_tms_url(self, limit=None):
        curs = self.get_cursor()
        query = ""

        if limit is None:
            query = """SELECT * FROM %s WHERE tms_url = ''""" % (
                self.master_grid_table)
        else:
            query = """SELECT * FROM %s WHERE tms_url = '' LIMIT %s""" % (
                self.master_grid_table, limit)

        curs.execute(query)

        return curs

    def drain(self):
        self.query_executor.drain()

    def close(self):
        self.query_executor.close()
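# A minimal lifecycle sketch for PSQLPClient (illustrative; assumes a config like
# the one sketched after Example #3 and a reachable PostgreSQL instance):
#   psqlclient = PSQLPClient(config)
#   psqlclient.connect()
#   curs = psqlclient.query_without_tms_url(limit=100)
#   rows = curs.fetchall()                            # cellgrids still missing a tms_url
#   psqlclient.insert_rows_by_one_async(scene_rows)   # scene_rows: provider, scene_id, cell_id, season, ...
#   psqlclient.drain()                                # wait for queued inserts to finish
#   psqlclient.close()                                # stop the query executor
#   psqlclient.connection_close()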