Example #1
    def buildWorkflow(self, request: TaskRequest, node: WorkflowNode, inputs: EDASDatasetCollection )  -> EDASDatasetCollection:
        snode: SourceNode = node
        results = EDASDatasetCollection( "InputKernel.build-" + node.name )
        t0 = time.time()
        dset = self.getCachedDataset( snode )
        if dset is not None:
            self.importToDatasetCollection(results, request, snode, dset.xr )
            self.logger.info( "Access input data from cache: " + dset.id )
        else:
            dataSource: DataSource = snode.varSource.dataSource
            if dataSource.type == SourceType.collection:
                from edas.collection.agg import Axis as AggAxis, File as AggFile
                collection = Collection.new( dataSource.address )
                self.logger.info("Input collection: " + dataSource.address )
                aggs = collection.sortVarsByAgg( snode.varSource.vids )
                domain = request.operationManager.domains.getDomain( snode.domain )
                if domain is not None:
                    timeBounds = domain.findAxisBounds(Axis.T)
                    startDate = None if timeBounds is None else TimeConversions.parseDate(timeBounds.start)
                    endDate   = None if timeBounds is None else TimeConversions.parseDate(timeBounds.end)
                else: startDate = endDate = None
                for ( aggId, vars ) in aggs.items():
                    use_chunks = True
                    pathList = collection.pathList(aggId) if startDate is None else collection.periodPathList(aggId,startDate,endDate)
                    assert len(pathList) > 0, f"No files found in aggregation {aggId} for date range {startDate} - {endDate} "
                    nFiles = len(pathList)
                    if use_chunks:
                        nReadPartitions = int( EdasEnv.get( "mfdataset.npartitions", 250 ) )
                        agg = collection.getAggregation(aggId)
                        nchunks, fileSize = agg.getChunkSize( nReadPartitions, nFiles )
                        chunk_kwargs = {} if nchunks is None else dict(chunks={"time": nchunks})
                        self.logger.info( f"Open mfdataset: vars={vars}, NFILES={nFiles}, FileSize={fileSize}, FILES[0]={pathList[0]}, chunk_kwargs={chunk_kwargs}, startDate={startDate}, endDate={endDate}, domain={domain}" )
                    else:
                        chunk_kwargs = {}
                        self.logger.info( f"Open mfdataset: vars={vars},  NFILES={nFiles}, FILES[0]={pathList[0]}" )
                    dset = xr.open_mfdataset( pathList, engine='netcdf4', data_vars=vars, parallel=True, **chunk_kwargs )
                    self.logger.info(f"Import to collection")
                    self.importToDatasetCollection( results, request, snode, dset )
                    self.logger.info(f"Collection import complete.")
            elif dataSource.type == SourceType.file:
                self.logger.info( "Reading data from address: " + dataSource.address )
                files = glob.glob( dataSource.address )
                parallel = len(files) > 1
                assert len(files) > 0, f"No files matching path {dataSource.address}"
                dset = xr.open_mfdataset(dataSource.address, engine='netcdf4', data_vars=snode.varSource.ids, parallel=parallel )
                self.importToDatasetCollection(results, request, snode, dset)
            elif dataSource.type == SourceType.archive:
                self.logger.info( "Reading data from archive: " + dataSource.address )
                dataPath =  request.archivePath( dataSource.address )
                dset = xr.open_mfdataset( [dataPath] )
                self.importToDatasetCollection(results, request, snode, dset)
            elif dataSource.type == SourceType.dap:
                nchunks = request.runargs.get( "ncores", 8 )
                self.logger.info( f" --------------->>> Reading data from address: {dataSource.address}, nchunks = {nchunks}" )
#                dset = xr.open_mfdataset( [dataSource.address], engine="netcdf4", data_vars=snode.varSource.ids, chunks={"time":nchunks} )
                dset = xr.open_dataset( dataSource.address, engine="netcdf4", chunks={"time":nchunks} )
                self.importToDatasetCollection( results, request, snode, dset )
            self.logger.info( f"Access input data source {dataSource.address}, time = {time.time() - t0} sec" )
            self.logger.info( "@L: LOCATION=> host: {}, thread: {}, proc: {}".format( socket.gethostname(), threading.get_ident(), os.getpid() ) )
        return results
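
For reference, the chunked-read branch above reduces to passing a chunks mapping through to xarray's open_mfdataset. A minimal standalone sketch of that pattern, where the path glob and chunk size are purely illustrative assumptions:

# Sketch of the chunked open_mfdataset pattern used in buildWorkflow.
# The path glob and nchunks value are illustrative assumptions.
import glob
import xarray as xr

pathList = sorted(glob.glob("/data/cip_merra2_mth/*.nc"))   # hypothetical aggregation files
nchunks = 100                                               # hypothetical time-chunk size

chunk_kwargs = {} if nchunks is None else dict(chunks={"time": nchunks})
dset = xr.open_mfdataset(pathList, engine="netcdf4", parallel=True, **chunk_kwargs)
print(dset)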
Example #2
 def getCapabilitiesXml(self, type: str) -> str:
     from edas.collection.agg import Collection
     if type is None: type = "kernels"
     self.logger.info(" GetCapabilities --> type: " + str(type))
     if (type.lower().startswith("ker")):
         specs = [
             opMod.getCapabilitiesXml()
             for opMod in self.operation_modules.values()
         ]
         return '<modules> {} </modules>'.format(" ".join(specs))
     elif (type.lower().startswith("col")):
         specs = Collection.getCollectionsList()
         return '<collection> {} </collection>'.format(" ".join(specs))
     elif (type.lower().startswith("var")):
         type_toks = type.split("|")
         collection = Collection.new(type_toks[1])
         return collection.getVariableSpec(type_toks[2])
     else:
         raise Exception("Unknown capabilities type: " + type)
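
The branching above implies three request forms; a hedged usage sketch follows, where mgr stands in for whatever object exposes getCapabilitiesXml (an assumption, not part of the example):

# Hypothetical calls; `mgr` is an assumed handler instance.
kernels_xml = mgr.getCapabilitiesXml("kernels")                # operation modules
coll_xml    = mgr.getCapabilitiesXml("collections")            # registered collections
var_xml     = mgr.getCapabilitiesXml("var|cip_merra2_mth|KE")  # "var|<collection>|<variable>"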
Example #3
 def getVariableSpec(self, collId: str, varId: str) -> Dict:
     from edas.collection.agg import Collection
     col = Collection.new(collId)
     varSpec = col.getVariableSpec(varId)
     return Message("var", "VariableSpec", varSpec).dict()
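
A minimal usage sketch of the variable-spec lookup, assuming a handler instance that exposes the method above (the collection and variable names are taken from the script that follows):

# Hypothetical call; `handler` is an assumed instance exposing getVariableSpec.
spec = handler.getVariableSpec("cip_merra2_mth", "KE")
print(spec)  # a dict built from Message("var", "VariableSpec", varSpec)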
import time, traceback
import xarray as xa
from dask.distributed import Client
from edas.collection.agg import Collection

print("STARTUP")
client = None
start = time.time()
collection = "cip_merra2_mth"
varName = 'KE'

try:
    client = Client('cldradn101:8786')

    print("READ " + collection)

    collection = Collection.new(collection)
    ds = xa.open_mfdataset(collection.pathList(varName),
                           data_vars=['KE'],
                           parallel=True)

    print("COMPUTE MEAN, Result:")

    lat_bnds, lon_bnds = [40, 43], [-96, -89]  # regional lat/lon bounds within CONUS
    ds = ds.sel(lat=slice(*lat_bnds), lon=slice(*lon_bnds))  # keep the subset for the mean below

    print(ds.KE.mean().values)

    print(" Completed computation in " + str(time.time() - start) + " seconds")

except Exception:
    traceback.print_exc()