def test_mock_from_point_query(self):
    """
    Checks the real PAIRS point query service against the mock used.

    Submits the raster and vector point-query sample requests once against
    the real server (mock stopped) and once against the mock, then verifies
    that both responses expose data records with identical keys.
    """
    def _submit_point_query(requestFileName):
        # load the query JSON without leaking the file handle
        # (the original used `json.load(open(...))`, never closing the file)
        with open(os.path.join(TEST_DATA_DIR, requestFileName)) as fp:
            queryDef = json.load(fp)
        return paw.PAIRSQuery(
            queryDef,
            'https://' + PAIRS_SERVER,
            auth=PAIRS_CREDENTIALS,
            baseURI=PAIRS_BASE_URI,
        )

    # get real data (temporarily disable the mocked PAIRS server)
    self.pairsServerMock.stop()
    testPointQueryRasterReal = _submit_point_query('point-data-sample-request-raster.json')
    testPointQueryVectorReal = _submit_point_query('point-data-sample-request-vector.json')
    self.pairsServerMock.start()
    # get mock data
    testPointQueryRasterMock = _submit_point_query('point-data-sample-request-raster.json')
    testPointQueryVectorMock = _submit_point_query('point-data-sample-request-vector.json')
    # compare data entry keys of real vs. mocked responses
    self.assertListEqual(
        sorted(testPointQueryRasterReal.querySubmit.json()['data'][0].keys()),
        sorted(testPointQueryRasterMock.querySubmit.json()['data'][0].keys()),
    )
    self.assertListEqual(
        sorted(testPointQueryVectorReal.querySubmit.json()['data'][0].keys()),
        sorted(testPointQueryVectorMock.querySubmit.json()['data'][0].keys()),
    )
def TO_BE_IMPLEMENTED_test_dataframe_generation(self):
    """
    Tests functions that massage the received data to the *unified* PAW dataframe.

    Runs a full submit -> poll -> download cycle for the raster sample
    request, builds the unified dataframe, and checks that the layer-name
    column was added.
    """
    # query mocked data
    logging.info(
        "TEST: Generation of unified PAW dataframe for raster data.")
    # load the request JSON without leaking the file handle
    with open(os.path.join(TEST_DATA_DIR, 'raster-data-sample-request.json')) as fp:
        rasterRequest = json.load(fp)
    testRasterQuery = paw.PAIRSQuery(
        rasterRequest,
        'https://' + PAIRS_SERVER,
        auth=PAIRS_CREDENTIALS,
        baseURI=PAIRS_BASE_URI,
    )
    testRasterQuery.submit()
    testRasterQuery.poll_till_finished(printStatus=True)
    testRasterQuery.download()
    # create dataframe from raster data
    testRasterQuery.create_dataframe()
    # check that the dataset and datalayer column names have been added
    self.assertIn(
        'layerName',
        testRasterQuery.dataframe[list(testRasterQuery.metadata.keys())[0]].columns,
    )
def create_pairs_query(query, config):
    """
    Build a ``paw.PAIRSQuery`` for *query* using the connection settings
    (server, username, password, download directory) found in *config*.
    """
    queryJson = create_pairs_query_dict(query)
    # echo the generated query definition (kept from original for debugging)
    print(queryJson)
    credentials = (config['username'], config['password'])
    return paw.PAIRSQuery(
        queryJson,
        config['server'],
        credentials,
        downloadDir=config['download_dir'],
    )
def test_from_point_query_vector(self):
    """
    Test querying vector point data.
    """
    # query mocked data
    logging.info("TEST: Query (mocked) point data.")
    # define point query; load the request JSON without leaking the file handle
    with open(os.path.join(TEST_DATA_DIR, 'point-data-sample-request-vector.json')) as fp:
        pointRequest = json.load(fp)
    testPointQuery = paw.PAIRSQuery(
        pointRequest,
        'https://' + PAIRS_SERVER,
        auth=PAIRS_CREDENTIALS,
        baseURI=PAIRS_BASE_URI,
    )
    # submit point query
    testPointQuery.submit()
    # for compliance with general PAW query scheme, perform fake poll and download
    testPointQuery.poll_till_finished()
    testPointQuery.download()
    testPointQuery.create_layers()
    # check vector data frame
    ## number of data points is correct
    logging.info("TEST: Perform vector data frame tests.")
    self.assertEqual(2, len(testPointQuery.vdf))
    ## column names agree with data response
    self.assertListEqual(
        sorted(list(testPointQuery.querySubmit.json()['data'][0].keys())),
        sorted(testPointQuery.vdf.columns),
    )
    ## check (some) data types from response
    self.assertIsInstance(testPointQuery.vdf.timestamp[0], datetime.datetime)
    self.assertIsInstance(testPointQuery.vdf.value[0], string_type)
    # check property string column splitting
    colsBeforeSplit = len(testPointQuery.vdf.columns)
    testPointQuery.split_property_string_column()
    colsAfterSplit = len(testPointQuery.vdf.columns)
    if paw.PROPERTY_STRING_COL_NAME_POINT in testPointQuery.vdf.columns:
        self.assertLess(colsBeforeSplit, colsAfterSplit)
    else:
        self.assertEqual(colsBeforeSplit, colsAfterSplit)
    # run twice to double-check it is not increasing the number of columns
    testPointQuery.split_property_string_column()
    colsAfter2ndSplit = len(testPointQuery.vdf.columns)
    self.assertEqual(colsAfterSplit, colsAfter2ndSplit)
def test_from_point_query_raster(self):
    """
    Test querying raster point data.
    """
    # query mocked data
    logging.info("TEST: Query (mocked) point data.")
    # define point query; load the request JSON without leaking the file handle
    with open(os.path.join(TEST_DATA_DIR, 'point-data-sample-request-raster.json')) as fp:
        pointRequest = json.load(fp)
    testPointQuery = paw.PAIRSQuery(
        pointRequest,
        'https://' + PAIRS_SERVER,
        auth=PAIRS_CREDENTIALS,
        baseURI=PAIRS_BASE_URI,
    )
    # submit point query
    testPointQuery.submit()
    # for compliance with general PAW query scheme, perform fake poll and download
    testPointQuery.poll_till_finished()
    testPointQuery.download()
    testPointQuery.create_layers()
    # try to split property string column (although having no effect, it should run through)
    colsBeforeSplit = len(testPointQuery.vdf.columns)
    testPointQuery.split_property_string_column()
    colsAfterSplit = len(testPointQuery.vdf.columns)
    self.assertEqual(colsBeforeSplit, colsAfterSplit)
    # check vector data frame
    ## number of data points is correct
    logging.info("TEST: Perform vector data frame tests.")
    self.assertEqual(2, len(testPointQuery.vdf))
    ## column names agree with data response (plus the added geometry column)
    self.assertListEqual(
        sorted(
            list(testPointQuery.querySubmit.json()['data'][0].keys())
            + [paw.PAIRS_VECTOR_GEOMETRY_COLUMN_NAME]
        ),
        sorted(testPointQuery.vdf.columns),
    )
    ## check (some) data types from response
    self.assertIsInstance(testPointQuery.vdf.longitude[0], float)
    self.assertIsInstance(testPointQuery.vdf.timestamp[0], datetime.datetime)
    self.assertIsInstance(testPointQuery.vdf.value[0], string_type)
def query_local(layerID):
    """
    Query near-global coverage of *layerID* as CSV from PAIRS and return the
    query object with a vector dataframe attached.

    The ``Region`` column of the result is split into ``pairs_id``,
    ``State`` and ``County`` columns on the returned object's ``vdf``.
    """
    queryDefinition = {
        "layers": [
            {"id": layerID},
        ],
        "spatial": {
            "type": "square",
            "coordinates": [-89, -179, 89, 179],
        },
        "temporal": {
            "intervals": [
                {"start": "2019-03-01T00:00:00Z", "end": "2030-03-10T23:59:59Z"}
            ]
        },
        "outputType": "csv",
    }
    # create PAIRS query instance
    pairsQuery = paw.PAIRSQuery(
        queryDefinition,
        pairsHost='https://' + PAIRS_SERVER,
        auth=PAIRS_CREDENTIALS,
        baseURI=BASE_URI,
        inMemory=True,
    )
    # submit and download modified query
    pairsQuery.submit()
    pairsQuery.poll_till_finished(printStatus=True)
    pairsQuery.download()
    pairsQuery.create_layers()
    # associate vector data frame with the first (and only) layer
    firstLayerKey = list(pairsQuery.metadata.keys())[0]
    pairsQuery.vdf = pairsQuery.data[firstLayerKey]
    # split the region string into individual columns,
    # treating ':' like a '.' separator
    regionParts = pairsQuery.vdf['Region'].str.replace(':', '.').str.split('.', expand=True)
    pairsQuery.vdf['pairs_id'] = regionParts[0]
    pairsQuery.vdf['State'] = regionParts[1]
    pairsQuery.vdf['County'] = regionParts[2]
    return pairsQuery
def __init__(self, queryList, auth=None, downloadDir='./downloads',
             overwriteExisting=False, maxConcurrent=2, logEverySeconds=30):
    """
    Sort the given queries into queued/running/completed/failed pools.

    :param queryList: mix of PAIRS query JSON dicts and paw.PAIRSQuery
        objects; raw dicts are wrapped into (queued) paw.PAIRSQuery objects
    :param auth: user name and password tuple used when wrapping raw dicts
    :param downloadDir: directory where to store downloaded data
    :param overwriteExisting: destroy locally cached data, if existing
    :param maxConcurrent: maximum number of concurrently running queries,
        capped at MAX_CONCURRENT
    :param logEverySeconds: status-logging interval in seconds
    :raises Exception: if maxConcurrent exceeds MAX_CONCURRENT, or the
        status of a given PAIRSQuery object cannot be determined
    """
    if maxConcurrent > MAX_CONCURRENT:
        raise Exception('Maximum value for maxConcurrent is {}.'.format(
            MAX_CONCURRENT))
    self.maxConcurrent = maxConcurrent
    self.logEverySeconds = logEverySeconds
    self.queries = {
        'queued': deque(),
        'running': deque(),
        'completed': deque(),
        'failed': deque()
    }
    for q in queryList:
        if isinstance(q, paw.PAIRSQuery):
            if q.querySubmit is None:
                # never submitted yet
                self.queries['queued'].append(q)
            elif q.queryStatus is None:
                # submitted, no status response yet
                self.queries['running'].append(q)
            else:
                # parse the status response only once
                # (the original re-parsed q.queryStatus.json() per branch)
                statusCode = q.queryStatus.json()['statusCode']
                if statusCode < 20:
                    self.queries['running'].append(q)
                elif statusCode == 20:
                    self.queries['completed'].append(q)
                elif statusCode > 20:
                    self.queries['failed'].append(q)
                else:
                    raise Exception(
                        'Cannot determine status of PAIRSQuery object.')
        else:
            self.queries['queued'].append(
                paw.PAIRSQuery(q, auth=auth,
                               downloadDir=downloadDir,
                               overwriteExisting=overwriteExisting))
def test_dataframe_generation(self):
    """
    Tests functions that massage the received data to the *unified* PAW dataframe.
    """
    # query mocked data
    logging.info(
        "TEST: Generation of unified PAW dataframe for point data.")
    # load the request JSON without leaking the file handle
    with open(os.path.join(TEST_DATA_DIR, 'point-data-sample-request-raster.json')) as fp:
        pointRequest = json.load(fp)
    testPointQuery = paw.PAIRSQuery(
        pointRequest,
        'https://' + PAIRS_SERVER,
        auth=PAIRS_CREDENTIALS,
        baseURI=PAIRS_BASE_URI,
    )
    # submit query
    testPointQuery.submit()
    # set timestamp column
    testPointQuery.set_timestamp_column('timestamp')
    # set point coordinate columns
    testPointQuery.set_lat_lon_columns('latitude', 'longitude', 'geometry')
def query_PAIRS(query_json, raw_data_path, path_to_credentials='./ibmpairspass.txt'):
    """
    Sends a request to PAIRS server and downloads the images in the area
    specified by coords. The raw images are saved in RAW_DATA_PATH

    Credentials are read from *path_to_credentials* as a single
    ``server:user:password`` line; the downloaded zip archive is removed
    after the layers have been loaded into memory.
    """
    # PAIRS server and authentication from the credentials file
    with open(path_to_credentials, 'r') as credsFile:
        serverName, userName, password = credsFile.read().split(':')
    # make request to IBM server for images from area within coordinates
    query = paw.PAIRSQuery(
        query_json,
        'https://' + serverName,
        (userName, password),
        baseURI='/',
        downloadDir=raw_data_path
    )
    # submit query and wait until downloaded
    query.submit()
    query.poll_till_finished()
    query.download()
    query.create_layers()
    # sort in reverse to get channels in R, G, B order
    images = [query.data[key] for key in sorted(query.data.keys(), reverse=True)]
    # delete the zip file
    os.remove(os.path.join(raw_data_path, query.zipFilePath))
    return images
def __init__(self, queryList, auth=None, downloadDir='./downloads',
             overwriteExisting=False, maxConcurrent=2, logEverySeconds=30):
    '''
    :param queryList: list containing a mix of PAIRS query JSONs and
        paw.PAIRSQuery objects. For paw.PAIRSQuery objects, only those
        which have not been submitted yet will be submitted.
    :type queryList: list
    :param auth: user name and password as tuple for access to pairsHost
    :type auth: (str, str)
    :param overwriteExisting: destroy locally cached data, if existing,
        otherwise grab the latest locally cached data, `latest` is defined
        by alphanumerical ordering of the PAIRS query ID
    :type overwriteExisting: bool
    :param downloadDir: directory where to store downloaded data
    :type downloadDir: str
    :param maxConcurrent: maximum number of concurrent queries. Note that
        the maximum number of concurrent queries might be limited server
        side for a particular user. There is no guarantee that a user can
        submit maxConcurrent queries at a given time.
    :type maxConcurrent: int
    :param logEverySeconds: time interval at which the class will send
        status messages to its logger in seconds (logging.INFO)
    :type logEverySeconds: int
    :raises Exception: if maxConcurrent exceeds MAX_CONCURRENT, or the
        status of a given PAIRSQuery object cannot be determined
    '''
    if maxConcurrent > MAX_CONCURRENT:
        raise Exception('Maximum value for maxConcurrent is {}.'.format(
            MAX_CONCURRENT))
    self.maxConcurrent = maxConcurrent
    self.logEverySeconds = logEverySeconds
    self.queries = {
        'queued': deque(),
        'running': deque(),
        'completed': deque(),
        'failed': deque()
    }
    for q in queryList:
        if isinstance(q, paw.PAIRSQuery):
            if q.querySubmit is None:
                # never submitted yet
                self.queries['queued'].append(q)
            elif q.queryStatus is None:
                # submitted, no status response yet
                self.queries['running'].append(q)
            else:
                # parse the status response only once
                # (the original re-parsed q.queryStatus.json() per branch)
                statusCode = q.queryStatus.json()['statusCode']
                if statusCode < 20:
                    self.queries['running'].append(q)
                elif statusCode == 20:
                    self.queries['completed'].append(q)
                elif statusCode > 20:
                    self.queries['failed'].append(q)
                else:
                    raise Exception(
                        'Cannot determine status of PAIRSQuery object.')
        else:
            self.queries['queued'].append(
                paw.PAIRSQuery(q, auth=auth,
                               downloadDir=downloadDir,
                               overwriteExisting=overwriteExisting))
def test_mock_raster_query(self):
    """
    Checks the real PAIRS raster query service against the mock used.
    """
    import time  # local import: only needed for the poll back-off below
    # get real data
    # prevent the responses module to complain about unused URL endpoints of the mock
    try:
        self.pairsServerMock.stop()
    except Exception as e:
        # catch not all requests called error
        logging.warning(
            'Stopping the mocked PAIRS server caused (potentially irrelevant) trouble: {}'
            .format(e))
    # check query submit
    logging.info("TEST: Perform query to real PAIRS server.")
    # load the request JSON once (original opened the file twice, leaking handles)
    with open(os.path.join(TEST_DATA_DIR, 'raster-data-sample-request.json')) as fp:
        rasterRequest = json.load(fp)
    subResp = requests.post(
        'https://' + PAIRS_SERVER + PAIRS_BASE_URI + QUERY_ENDPOINT,
        json=rasterRequest,
        auth=PAIRS_CREDENTIALS,
    ).json()
    self.assertIn('id', subResp.keys())
    self.assertIsInstance(subResp['id'], string_type)
    # check query poll
    while True:
        statResp = requests.get(
            'https://' + PAIRS_SERVER + PAIRS_BASE_URI + STATUS_ENDPOINT + subResp['id'],
            auth=PAIRS_CREDENTIALS,
        ).json()
        # unittest assertion for consistency (was a bare `assert`)
        self.assertTrue(
            set(['id', 'rtStatus', 'statusCode']) <= set(statResp.keys()))
        self.assertIsInstance(statResp['statusCode'], int)
        if statResp['statusCode'] >= 20:
            break
        # back off between polls: do not hammer the real server
        time.sleep(1)
    # check query result
    downloadResp = requests.get(
        'https://' + PAIRS_SERVER + PAIRS_BASE_URI + DOWNLOAD_ENDPOINT + subResp['id'],
        auth=PAIRS_CREDENTIALS,
        stream=True,
    )
    pairsDataZip = '/tmp/pairs-test-raster-download-{}.zip'.format(subResp['id'])
    with open(pairsDataZip, 'wb') as f:
        for chunk in downloadResp.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    self.pairsServerMock.start()
    # basic test of real data
    self.assertTrue(zipfile.is_zipfile(pairsDataZip))
    # get mock data
    testRasterQuery = paw.PAIRSQuery(
        rasterRequest,
        'https://' + PAIRS_SERVER,
        auth=PAIRS_CREDENTIALS,
        baseURI=PAIRS_BASE_URI,
    )
    testRasterQuery.submit()
    testRasterQuery.poll_till_finished(printStatus=True)
    testRasterQuery.download()
    pairsMockZip = testRasterQuery.queryDir + '.zip'
    # make sure that files in mock are available in real download
    # and that the size of the data and the mock are approximately the same
    logging.info(
        "TEST: Check that all files from the mock exist in the real data queried."
    )
    with zipfile.ZipFile(pairsMockZip, 'r') as mock, \
         zipfile.ZipFile(pairsDataZip, 'r') as real:
        # generate info dictionaries
        mockInfo = {f.filename: f.file_size for f in mock.infolist()}
        realInfo = {f.filename: f.file_size for f in real.infolist()}
        # check that files in mock are contained in real data (in terms of names)
        self.assertTrue(set(mockInfo.keys()) <= set(realInfo.keys()))
        # check that file sizes are approximately the same
        for key in mockInfo.keys():
            self.assertAlmostEqual(
                mockInfo[key], realInfo[key],
                delta=self.REL_FILESIZE_DEV * realInfo[key])
def vector_query(self, useLocalZip=False):
    """
    Query vector data in various ways.

    Runs a submit -> poll -> download -> parse cycle for the vector sample
    request (or loads a cached local zip) and checks metadata, dataframe
    types, and property-string column splitting.

    :param useLocalZip: if True, construct the query from the locally
        cached zip instead of submitting to the (mocked) server
    """
    # query mocked data
    logging.info("TEST: Query (mocked) data.")
    testVectorQuery = paw.PAIRSQuery(
        json.load(open(os.path.join(TEST_DATA_DIR,'vector-data-sample-request.json'))) \
        if not useLocalZip else self.PAIRS_VECTOR_ZIP_PATH,
        'https://'+PAIRS_SERVER,
        auth = PAIRS_CREDENTIALS,
        baseURI = PAIRS_BASE_URI,
    )
    # check that query got submitted
    testVectorQuery.submit()
    if not useLocalZip:
        self.assertTrue(testVectorQuery.querySubmit.ok)
    # poll and check that data status is finished
    testVectorQuery.poll_till_finished(printStatus=True)
    if not useLocalZip:
        self.assertTrue(testVectorQuery.queryStatus.ok)
    # check that certain files exist
    testVectorQuery.download()
    self.assertTrue(os.path.exists(testVectorQuery.zipFilePath))
    logging.info("TEST: Check files downloaded.")
    with zipfile.ZipFile(testVectorQuery.zipFilePath) as zf:
        pass
        # test the existence of the basic meta file
        # ATTENTION: disabled for now, because it needs to be implemented
        #for fileName in ['output.info', ]:
        #    self.assertTrue(
        #        fileName in zf.namelist()
        #    )
    # load vector meta data (original comment said "raster")
    logging.info("TEST: Load vector meta data.")
    testVectorQuery.list_layers()
    # check that all data are listed as type vector
    self.assertTrue(
        all([
            'vector' == meta['layerType']
            for meta in testVectorQuery.metadata.values()
        ]))
    logging.info("TEST: Create dataframe from raster data.")
    # load the vector data into Pandas dataframes
    # (original comment said "raster data into a NumPy array")
    testVectorQuery.create_layers()
    # access the vector dataframe
    for name, meta in testVectorQuery.metadata.items():
        if meta['layerType'] == 'vector':
            self.assertIsInstance(
                testVectorQuery.data[name],
                pandas.DataFrame,
            )
            # try to split property string column (if any)
            testVectorQuery.vdf = testVectorQuery.data[name]
            # check property string column splitting
            colsBeforeSplit = len(testVectorQuery.vdf.columns)
            testVectorQuery.split_property_string_column()
            colsAfterSplit = len(testVectorQuery.vdf.columns)
            if paw.PROPERTY_STRING_COL_NAME in testVectorQuery.vdf.columns:
                self.assertLess(colsBeforeSplit, colsAfterSplit)
            else:
                self.assertEqual(colsBeforeSplit, colsAfterSplit)
            # run twice to double-check it is not increasing the number of columns
            testVectorQuery.split_property_string_column()
            colsAfter2ndSplit = len(testVectorQuery.vdf.columns)
            self.assertEqual(colsAfterSplit, colsAfter2ndSplit)
    # check that the data acknowledgement statement is not empty
    self.assertIsNotNone(testVectorQuery.dataAcknowledgeText)
def raster_aggregation_query(self, useLocalZip=False):
    """
    Query aggregated raster data.

    Runs a submit -> poll -> download -> parse cycle for the aggregation
    sample request (or loads a cached local zip); the aggregated raster
    output is delivered as CSV and parsed as vector-type data.

    :param useLocalZip: if True, construct the query from the locally
        cached zip instead of submitting to the (mocked) server
    """
    # query mocked data
    logging.info("TEST: Query (mocked) aggregation data.")
    testRasterAggQuery = paw.PAIRSQuery(
        json.load(open(os.path.join(TEST_DATA_DIR,'aggregation-data-sample-request.json'))) \
        if not useLocalZip else self.PAIRS_AGG_RASTER_ZIP_PATH,
        'https://'+PAIRS_SERVER,
        auth = PAIRS_CREDENTIALS,
        baseURI = PAIRS_BASE_URI,
    )
    # check that query got submitted
    testRasterAggQuery.submit()
    if not useLocalZip:
        self.assertTrue(testRasterAggQuery.querySubmit.ok)
    # poll and check that data status is finished
    testRasterAggQuery.poll_till_finished(printStatus=True)
    if not useLocalZip:
        self.assertTrue(testRasterAggQuery.queryStatus.ok)
    # check that certain files exist
    testRasterAggQuery.download()
    self.assertTrue(os.path.exists(testRasterAggQuery.zipFilePath))
    logging.info("TEST: Check files downloaded.")
    with zipfile.ZipFile(testRasterAggQuery.zipFilePath) as zf:
        # test the existence of the basic meta file
        for fileName in [
                'output.info',
        ]:
            self.assertTrue(fileName in zf.namelist())
        # check that for each aggregated CSV file there exists a corresponding JSON meta file
        for rasterFilePath in zf.namelist():
            # find all PAIRS GeoTiff files
            if rasterFilePath.endswith('.csv'):
                # check a corresponding JSON file exists
                self.assertTrue(rasterFilePath + '.json' in zf.namelist())
                # try to temporarily open the JSON file
                json.loads(zf.read(rasterFilePath + '.json'))
    # load aggregated raster meta data (which are actually vector-type data!)
    logging.info("TEST: Load aggregated raster meta data.")
    testRasterAggQuery.list_layers()
    # check that 'details' of raster data have been successfully loaded by
    # getting the spatial reference information
    self.assertIsInstance(
        list(testRasterAggQuery.metadata.values())[0]["details"]
        ["spatialRef"], string_type)
    # check that all data are listed as type vector
    self.assertTrue(
        all([
            'vector' == meta['layerType']
            for meta in testRasterAggQuery.metadata.values()
        ]))
    logging.info(
        "TEST: Create Pandas dataframes from aggregated raster data.")
    # load the aggregated raster data as vector data into Pandas dataframes
    testRasterAggQuery.create_layers()
    # access the Pandas dataframe of each layer
    # (original comment said "numpy array"; the assertion below checks DataFrame)
    for name, meta in testRasterAggQuery.metadata.items():
        if meta['layerType'] == 'vector':
            self.assertIsInstance(
                testRasterAggQuery.data[name],
                pandas.DataFrame,
            )
    # check that the data acknowledgement statement is not empty
    self.assertIsNotNone(testRasterAggQuery.dataAcknowledgeText)