def on_start(self):

        ImmediateProcess.on_start(self)

        # required arguments, passed in via the configuration kwarg to schedule_process; keys are namespaced under 'process' to avoid collisions
        fuc_id = self.CFG.get_safe('process.fuc_id', None)  # FileUploadContext ID

        # Clients
        self.object_store = self.container.object_store

        # run process
        self.process(fuc_id)
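
The `process.fuc_id` key read above is expected to arrive through the configuration kwarg of schedule_process. A minimal sketch of the calling side, assuming pyon's generated ProcessDispatcherServiceClient; the import path, function name, and arguments here are assumptions, not the authors' actual launcher code:

from interface.services.cei.iprocess_dispatcher_service import ProcessDispatcherServiceClient

def launch_upload_processing(process_definition_id, fuc_id):
    # keys are namespaced under 'process' so they cannot collide with other container config
    config = {'process': {'fuc_id': fuc_id}}
    pd_client = ProcessDispatcherServiceClient()
    return pd_client.schedule_process(process_definition_id, configuration=config)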
    def on_start(self):

        ImmediateProcess.on_start(self)

        # required arguments, passed in via the configuration kwarg to schedule_process; keys are namespaced under 'process' to avoid collisions
        fuc_id = self.CFG.get_safe('process.fuc_id', None)  # FileUploadContext ID

        # Clients
        self.object_store = self.container.object_store
        self.event_publisher = EventPublisher(OT.ResetQCEvent)

        # run process
        self.process(fuc_id)

        # cleanup
        self.event_publisher.close()
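
This variant opens an EventPublisher for the lifetime of the process and closes it once process() returns. A sketch of that publish/close lifecycle, assuming pyon's EventPublisher API; the import path and the origin argument are assumptions:

from pyon.public import OT, EventPublisher  # import path is an assumption

def reset_qc(origin_name):
    publisher = EventPublisher(OT.ResetQCEvent)
    # tell any listeners that cached QC results for this origin should be recomputed
    publisher.publish_event(origin=origin_name)
    publisher.close()  # release the underlying messaging endpoint when done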
    def on_start(self):

        ImmediateProcess.on_start(self)

        # required arguments, passed in via the configuration kwarg to schedule_process; keys are namespaced under 'process' to avoid collisions
        fuc_id = self.CFG.get_safe('process.fuc_id', None)  # FileUploadContext ID

        # Clients
        self.object_store = self.container.object_store
        self.resource_registry = self.container.resource_registry
        self.event_publisher = EventPublisher(OT.ResetQCEvent)
        self.data_product_management = DataProductManagementServiceProcessClient(process=self)
        self.create_map()

        # run process
        if fuc_id:
            self.process(fuc_id)

        # cleanup
        self.event_publisher.close()
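
CFG.get_safe tolerates a missing key anywhere along the dotted path, which is why the snippet above can guard on `if fuc_id:` instead of catching KeyError. A minimal re-implementation sketch of that lookup behaviour (not pyon's actual code):

def get_safe(cfg, dotted_key, default=None):
    """Walk nested dicts one key at a time; return default on any miss."""
    value = cfg
    for key in dotted_key.split('.'):
        try:
            value = value[key]
        except (KeyError, TypeError):
            return default
    return value

assert get_safe({'process': {'fuc_id': 'abc123'}}, 'process.fuc_id') == 'abc123'
assert get_safe({}, 'process.fuc_id') is None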
    def on_start(self):

        ImmediateProcess.on_start(self)

        # required arguments, passed in via the configuration kwarg to schedule_process; keys are namespaced under 'process' to avoid collisions
        fuc_id = self.CFG.get_safe('process.fuc_id', None)  # FileUploadContext ID
        dp_id = self.CFG.get_safe('process.dp_id', None)  # DataProduct ID

        # clients we'll need
        resource_registry = self.container.resource_registry
        object_store = self.container.object_store
        dataset_management = DatasetManagementServiceClient()
        data_product_management = DataProductManagementServiceClient()

        # get the Object (dict) containing details of the uploaded file
        fuc = object_store.read(fuc_id)

        # get the ParameterContexts associated with this DataProduct
        sd_id = resource_registry.find_objects(dp_id, PRED.hasStreamDefinition, id_only=True)[0][0] # TODO loop
        pd_id = resource_registry.find_objects(sd_id, PRED.hasParameterDictionary, id_only=True)[0][0] # TODO loop
        pc_list, _ = resource_registry.find_objects(pd_id, PRED.hasParameterContext, id_only=False) # parameter contexts

        # NetCDF file open here
        nc_filename = fuc.get('path', None)
        if nc_filename is None:
            raise BadRequest("uploaded file has no path")

        # keep track of the number of fields we actually process
        nfields = 0

        with netCDF4.Dataset(nc_filename, 'r') as nc:

            nc_time = nc.variables['time'][:] # don't modify nc_time below, read once use many times

            for v in nc.variables:
                variable = nc.variables[v]
                nc_name = str(v) # variable name should match what was downloaded; the copy gets a 'c' suffix appended below
                # check for REQUIRED attributes
                author = getattr(variable, 'author', None)
                reason = getattr(variable, 'reason', None)
                if not all([author, reason]):
                    log.info('skipping parameter %s - no author or reason' % nc_name)
                    continue
                # get all ParameterContexts (from pc_list) with this 'name' (should be one at the moment)
                pc_matches_nc_name_list = [c for c in pc_list if c.name == nc_name]
                # is variable already present?
                if len(pc_matches_nc_name_list) < 1:
                    log.info('skipping parameter %s - not found in ParameterContexts associated with DataProduct' % nc_name)
                    continue

                # we are using this ParameterContext as a copy
                pc = pc_matches_nc_name_list[0] # TODO should only have 1 context per 'name' but could be checked for completeness
                # only allow L1/L2 parameters (check against ooi_short_name, which should end with _L1/_L2)
                m = re.compile('(_L[12])$').search(pc.ooi_short_name.upper()) # capture L1/L2 for use in new name
                if not m: # if not _L1 or _L2 move on
                    log.info('skipping parameter %s - not L1 or L2' % nc_name)
                    continue
                processing_level = m.group(1)
                # remove attributes we should not copy [_id,_rev,ts_created,ts_updated]
                delattr(pc, '_id')
                delattr(pc, '_rev')
                delattr(pc, 'ts_created')
                delattr(pc, 'ts_updated')
                # append L[12]c to name attribute (new parameter name)
                c_name = ''.join([pc['name'], processing_level, 'c'])
                pc['name'] = c_name
                # copy attributes from NetCDF file
                pc['units'] = variable.units
                pc['value_encoding'] = str(variable.dtype)
                #TODO ERDAP files don't have fill_value, but should probably get from there, leaving copy for now
                # create ParameterContext
                pc_id = dataset_management.create_parameter(pc)
                data_product_management.add_parameter_to_data_product(pc_id, dp_id)

                # get NetCDF data for this variable
                nc_data = variable[:]

                with DirectCoverageAccess() as dca:
                    # get the Dataset IDs associated with this DataProduct
                    ds_id_list, _ = resource_registry.find_objects(dp_id, PRED.hasDataset, id_only=True)
                    for ds_id in ds_id_list: # could be multiple Datasets for this DataProduct
                        with dca.get_editable_coverage(ds_id) as cov: # <-- This pauses ingestion
                            # times in this Dataset
                            cov_time = cov._range_value['time'][:]
                            # subset nc_time (only where nc_time matches cov_time)
                            nc_indices = [i for i, x in enumerate(nc_time) if x in cov_time]
                            subset_nc_time = nc_time[nc_indices] + 2208988800 # TODO REMOVE THIS? ERDDAP 1970 vs NTP 1900
                            # don't forget to subset the data too
                            subset_nc_data = [nc_data[i] for i in nc_indices]
                            # use indices of where subset_nc_time exists in cov_time to update coverage
                            cov_indices = np.flatnonzero(np.in1d(cov_time, subset_nc_time)) # returns numpy.ndarray of indices
                            cov_indices = list(cov_indices) # convert to a plain list for the coverage
                            #cov._range_value[c_name][cov_indices] = subset_nc_data # TODO this should eventually work
                            for i, x in enumerate(cov_indices):
                                cov._range_value[c_name][x] = subset_nc_data[i]

                nfields += 1

        fuc['status'] = 'UploadDataProcessing process complete - %d fields created/updated' % nfields
        self.container.object_store.update_doc(fuc)

        # remove uploaded file
        try:
            os.remove(nc_filename)
        except OSError:
            pass # TODO take action to get this removed
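
The magic number 2208988800 in the time subsetting above is the offset, in seconds, between the NTP epoch (1900-01-01) and the Unix epoch (1970-01-01) that the ERDDAP files use. A quick verification:

from datetime import datetime

# 70 years including 17 leap days: (70 * 365 + 17) * 86400 seconds
NTP_UNIX_DELTA = (datetime(1970, 1, 1) - datetime(1900, 1, 1)).total_seconds()
assert NTP_UNIX_DELTA == 2208988800.0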
Example #7
    def on_start(self):

        ImmediateProcess.on_start(self)

        # required arguments, passed in via the configuration kwarg to schedule_process; keys are namespaced under 'process' to avoid collisions
        fuc_id = self.CFG.get_safe('process.fuc_id', None)  # FileUploadContext ID
        dp_id = self.CFG.get_safe('process.dp_id', None)  # DataProduct ID

        # clients we'll need
        resource_registry = self.container.resource_registry
        object_store = self.container.object_store
        dataset_management = DatasetManagementServiceClient()
        data_product_management = DataProductManagementServiceClient()

        # get the Object (dict) containing details of the uploaded file
        fuc = object_store.read(fuc_id)

        # get the ParameterContexts associated with this DataProduct
        sd_id = resource_registry.find_objects(dp_id, PRED.hasStreamDefinition, id_only=True)[0][0] # TODO loop
        pd_id = resource_registry.find_objects(sd_id, PRED.hasParameterDictionary, id_only=True)[0][0] # TODO loop
        pc_list, _ = resource_registry.find_objects(pd_id, PRED.hasParameterContext, id_only=False) # parameter contexts

        # NetCDF file open here
        nc_filename = fuc.get('path', None)
        if nc_filename is None:
            raise BadRequest("uploaded file has no path")

        # keep track of the number of fields we actually process
        nfields = 0

        with netCDF4.Dataset(nc_filename, 'r') as nc:

            nc_time = nc.variables['time'][:] # don't modify nc_time below, read once use many times

            for v in nc.variables:
                variable = nc.variables[v]
                nc_name = str(v) # variable name should match what was downloaded; the copy gets a 'c' suffix appended below
                # check for REQUIRED attributes
                author = getattr(variable, 'author', None)
                reason = getattr(variable, 'reason', None)
                if not all([author, reason]):
                    log.info('skipping parameter %s - no author or reason' % nc_name)
                    continue
                # get all ParameterContexts (from pc_list) with this 'name' (should be one at the moment)
                pc_matches_nc_name_list = [c for c in pc_list if c.name == nc_name]
                # is variable already present?
                if len(pc_matches_nc_name_list) < 1:
                    log.info('skipping parameter %s - not found in ParameterContexts associated with DataProduct' % nc_name)
                    continue

                # we are using this ParameterContext as a copy
                pc = pc_matches_nc_name_list[0] # TODO should only have 1 context per 'name' but could be checked for completeness
                # only allow L1/L2 parameters (check against ooi_short_name, which should end with _L1/_L2)
                m = re.compile('(_L[12])$').search(pc.ooi_short_name.upper()) # capture L1/L2 for use in new name
                if not m: # if not _L1 or _L2 move on
                    log.info('skipping parameter %s - not L1 or L2' % nc_name)
                    continue
                processing_level = m.group(1)
                # remove attributes we should not copy [_id,_rev,ts_created,ts_updated]
                delattr(pc, '_id')
                delattr(pc, '_rev')
                delattr(pc, 'ts_created')
                delattr(pc, 'ts_updated')
                # append L[12]c to name attribute (new parameter name)
                c_name = ''.join([pc['name'], processing_level, 'c'])
                pc['name'] = c_name
                # copy attributes from NetCDF file
                pc['units'] = variable.units
                pc['value_encoding'] = str(variable.dtype)
                #TODO ERDAP files don't have fill_value, but should probably get from there, leaving copy for now
                # create ParameterContext
                pc_id = dataset_management.create_parameter(pc)
                data_product_management.add_parameter_to_data_product(pc_id, dp_id)

                # get NetCDF data for this variable
                nc_data = variable[:]

                with DirectCoverageAccess() as dca:
                    # get the Dataset IDs associated with this DataProduct
                    ds_id_list, _ = resource_registry.find_objects(dp_id, PRED.hasDataset, id_only=True)
                    for ds_id in ds_id_list: # could be multiple Datasets for this DataProduct
                        with dca.get_editable_coverage(ds_id) as cov: # <-- This pauses ingestion
                            # times in this Dataset
                            cov_time = cov.get_parameter_values(['time']).get_data()['time']
                            # subset nc_time (only where nc_time matches cov_time)
                            nc_indices = [i for i, x in enumerate(nc_time) if x in cov_time]
                            subset_nc_time = nc_time[nc_indices] + 2208988800 # TODO REMOVE THIS? ERDDAP 1970 vs NTP 1900
                            # don't forget to subset the data too
                            subset_nc_data = [nc_data[i] for i in nc_indices]
                            # use indices of where subset_nc_time exists in cov_time to update coverage
                            cov_indices = np.flatnonzero(np.in1d(cov_time, subset_nc_time)) # returns numpy.ndarray of indices
                            cov_indices = list(cov_indices) # convert to a plain list for the coverage
                            #cov._range_value[c_name][cov_indices] = subset_nc_data # TODO this should eventually work
                            for i, x in enumerate(cov_indices):
                                cov._range_value[c_name][x] = subset_nc_data[i]

                nfields += 1

        fuc['status'] = 'UploadDataProcessing process complete - %d fields created/updated' % nfields
        self.container.object_store.update_doc(fuc)

        # remove uploaded file
        try:
            os.remove(nc_filename)
        except OSError:
            pass # TODO take action to get this removed
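
The coverage update in both long examples above aligns the uploaded time axis with the coverage's time axis by set membership before overwriting values. A standalone sketch of that technique in plain numpy (the array values here are made up for illustration):

import numpy as np

cov_time = np.array([10, 20, 30, 40, 50])   # times already in the coverage
nc_time = np.array([20, 40, 60])            # times in the uploaded file
nc_data = np.array([2.0, 4.0, 6.0])         # one value per uploaded time

# keep only the uploaded samples whose timestamps already exist in the coverage
nc_mask = np.in1d(nc_time, cov_time)                                # -> [True, True, False]
# positions in the coverage that those surviving timestamps occupy
cov_indices = np.flatnonzero(np.in1d(cov_time, nc_time[nc_mask]))   # -> [1, 3]

for cov_i, value in zip(cov_indices, nc_data[nc_mask]):
    print('coverage[%d] = %s' % (cov_i, value))  # coverage[1] = 2.0, coverage[3] = 4.0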