Example #1
 def _splice_coverage(cls, dataset_id, scov):
     file_root = FileSystem.get_url(FS.CACHE, 'datasets')
     vcov = cls._get_coverage(dataset_id, mode='a')
     scov_pth = scov.persistence_dir
     if isinstance(vcov.reference_coverage, SimplexCoverage):
         ccov = ComplexCoverage(
             file_root,
             uuid4().hex,
             'Complex coverage for %s' % dataset_id,
             reference_coverage_locs=[
                 vcov.head_coverage_path,
             ],
             parameter_dictionary=ParameterDictionary(),
             complex_type=ComplexCoverageType.TEMPORAL_AGGREGATION)
         log.info('Creating Complex Coverage: %s', ccov.persistence_dir)
         ccov.append_reference_coverage(scov_pth)
         ccov_pth = ccov.persistence_dir
         ccov.close()
         vcov.replace_reference_coverage(ccov_pth)
     elif isinstance(vcov.reference_coverage, ComplexCoverage):
         log.info('Appending simplex coverage to complex coverage')
         #vcov.reference_coverage.append_reference_coverage(scov_pth)
         dir_path = vcov.reference_coverage.persistence_dir
         vcov.close()
         ccov = AbstractCoverage.load(dir_path, mode='a')
         ccov.append_reference_coverage(scov_pth)
         ccov.refresh()
         ccov.close()
     vcov.refresh()
     vcov.close()
Example #2
    def __init__(self, name = None):
        """
        @param name The name of the dataset
        """
        # generate a random name for the filename if it has not been provided.
        self.filename = FileSystem.get_url(fs=FS.TEMP, filename=name or random_name(), ext='encoder.hdf5')

        # Using inline imports to put off making hdf/numpy required dependencies
        import h5py

        # open an hdf file on disk - in /tmp - to write data to, since we can't yet do it in memory
        try:
            log.debug("Creating h5py file object for the encoder at %s" % self.filename)
            if os.path.isfile(self.filename):
                # if file exists, then append to it
                self.h5pyfile = h5py.File(self.filename, mode = 'r+', driver='core')
            else:
                # if file does not already exist, write a new one
                self.h5pyfile = h5py.File(self.filename, mode = 'w', driver='core')
            assert self.h5pyfile, 'No h5py file object created.'
        except IOError:
            log.debug("Error opening file for the HDFEncoder! ")
            raise HDFEncoderException("Error while trying to open file. ")
        except AssertionError as err:
            log.debug(err.message)
            raise HDFEncoderException(err.message)
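
A side note on the open-or-append branching above: h5py's 'a' mode already means "read/write if the file exists, create otherwise", so the os.path.isfile() check can be collapsed into a single call. A minimal, standalone sketch (the filename below is hypothetical, not derived from FileSystem.get_url):

    import h5py

    def open_hdf_for_append(filename):
        # 'a' = read/write if the file exists, create otherwise,
        # which covers both branches of the encoder __init__ above
        return h5py.File(filename, mode='a', driver='core')

    # usage
    h5pyfile = open_hdf_for_append('/tmp/encoder_sketch.hdf5')
    h5pyfile.close()
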
 def _create_coverage(self, dataset_id, description, parameter_dict, spatial_domain,temporal_domain):
     pdict = ParameterDictionary.load(parameter_dict)
     sdom = GridDomain.load(spatial_domain)
     tdom = GridDomain.load(temporal_domain)
     file_root = FileSystem.get_url(FS.CACHE,'datasets')
     scov = SimplexCoverage(file_root,dataset_id,description or dataset_id,parameter_dictionary=pdict, temporal_domain=tdom, spatial_domain=sdom, inline_data_writes=self.inline_data_writes)
     return scov
 def on_start(self):
     super(TransformCapture, self).on_start()
     #        #@todo: Remove debugging statements
     log.debug('(Transform: %s) Starting...', self.name)
     self.file_name = self.CFG.get_safe(
         'process.file_name', FileSystem.get_url(FS.TEMP,
                                                 'transform_output'))
Example #5
        def create_known(dataset_name, rootgrp_name, grp_name):
            """
            A known array to compare against during tests
            """

            known_array = numpy.random.rand(10, 20)

            filename = FileSystem.get_url(FS.TEMP, random_name(), ".hdf5")

            # Write an hdf file with known values to compare against
            h5pyfile = h5py.File(filename, mode='w', driver='core')
            grp = h5pyfile.create_group(rootgrp_name)
            subgrp = grp.create_group(grp_name)
            dataset = subgrp.create_dataset(dataset_name,
                                            known_array.shape,
                                            known_array.dtype.str,
                                            compression='gzip',
                                            compression_opts=4,
                                            maxshape=(None, None))

            dataset.write_direct(known_array)
            h5pyfile.close()

            # convert the hdf file into a binary string
            f = open(filename, mode='rb')
            # read the binary string representation of the file
            known_hdf_as_string = f.read()  # this is a known string to compare against during tests
            f.close()
            # cleaning up
            FileSystem.unlink(f.name)

            return known_array, known_hdf_as_string
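
A typical consumer of the (known_array, known_hdf_as_string) pair would write the byte string back to disk, reopen it with h5py and compare the stored dataset against the array. The helper below is only an illustrative sketch; its name and its use of tempfile are assumptions, not part of the test code above:

    import os
    import tempfile

    import h5py
    import numpy

    def check_round_trip(known_array, known_hdf_as_string,
                         dataset_name, rootgrp_name, grp_name):
        # write the byte string back out and re-open it as an HDF5 file
        tmp = tempfile.NamedTemporaryFile(suffix='.hdf5', delete=False)
        try:
            tmp.write(known_hdf_as_string)
            tmp.close()
            with h5py.File(tmp.name, mode='r') as h5file:
                stored = h5file['%s/%s/%s' % (rootgrp_name, grp_name, dataset_name)][...]
            numpy.testing.assert_array_equal(stored, known_array)
        finally:
            os.unlink(tmp.name)
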
 def _splice_coverage(cls, dataset_id, scov):
     file_root = FileSystem.get_url(FS.CACHE,'datasets')
     vcov = cls._get_coverage(dataset_id,mode='a')
     scov_pth = scov.persistence_dir
     if isinstance(vcov.reference_coverage, SimplexCoverage):
         ccov = ComplexCoverage(file_root, uuid4().hex, 'Complex coverage for %s' % dataset_id, 
                 reference_coverage_locs=[vcov.head_coverage_path,],
                 parameter_dictionary=ParameterDictionary(),
                 complex_type=ComplexCoverageType.TEMPORAL_AGGREGATION)
         log.info('Creating Complex Coverage: %s', ccov.persistence_dir)
         ccov.append_reference_coverage(scov_pth)
         ccov_pth = ccov.persistence_dir
         ccov.close()
         vcov.replace_reference_coverage(ccov_pth)
     elif isinstance(vcov.reference_coverage, ComplexCoverage):
         log.info('Appending simplex coverage to complex coverage')
         #vcov.reference_coverage.append_reference_coverage(scov_pth)
         dir_path = vcov.reference_coverage.persistence_dir
         vcov.close()
         ccov = AbstractCoverage.load(dir_path, mode='a')
         ccov.append_reference_coverage(scov_pth)
         ccov.refresh()
         ccov.close()
     vcov.refresh()
     vcov.close()
        def create_known(dataset_name, rootgrp_name, grp_name):
            """
            A known array to compare against during tests
            """

            known_array = numpy.ones((10,20))

            filename = FileSystem.get_url(FS.TEMP,random_name(), ".hdf5")

            # Write an hdf file with known values to compare against
            h5pyfile = h5py.File(filename, mode = 'w', driver='core')
            grp = h5pyfile.create_group(rootgrp_name)
            subgrp = grp.create_group(grp_name)
            dataset = subgrp.create_dataset(dataset_name, known_array.shape, known_array.dtype.str, maxshape=(None,None))
            dataset.write_direct(known_array)
            h5pyfile.close()

            # convert the hdf file into a binary string
            f = open(filename, mode='rb')
            # read the binary string representation of the file
            known_hdf_as_string = f.read() # this is a known string to compare against during tests
            f.close()
            # cleaning up
            FileSystem.unlink(f.name)

            return known_array, known_hdf_as_string
def upload_qc():
    upload_folder = FileSystem.get_url(FS.TEMP, 'uploads')
    try:

        object_store = Container.instance.object_store

        # required fields
        upload = request.files['file']  # <input type=file name="file">

        if upload:

            # upload file - run filename through werkzeug.secure_filename
            filename = secure_filename(upload.filename)
            path = os.path.join(upload_folder, filename)
            upload_time = time.time()
            upload.save(path)
            filetype = _check_magic(upload) or 'CSV'  # Either going to be ZIP or CSV, probably

            # register upload
            file_upload_context = {
                'name': 'User uploaded QC file %s' % filename,
                'filename': filename,
                'filetype': filetype,  # only CSV, no detection necessary
                'path': path,
                'upload_time': upload_time,
                'status': 'File uploaded to server'
            }
            fuc_id, _ = object_store.create_doc(file_upload_context)

            # client to process dispatch
            pd_client = ProcessDispatcherServiceClient()

            # create process definition
            process_definition = ProcessDefinition(
                name='upload_qc_processor',
                executable={
                    'module': 'ion.processes.data.upload.upload_qc_processing',
                    'class': 'UploadQcProcessing'
                })
            process_definition_id = pd_client.create_process_definition(
                process_definition)
            # create process
            process_id = pd_client.create_process(process_definition_id)
            #schedule process
            config = DotDict()
            config.process.fuc_id = fuc_id
            pid = pd_client.schedule_process(process_definition_id,
                                             process_id=process_id,
                                             configuration=config)
            log.info('UploadQcProcessing process created %s' % pid)
            # response - only FileUploadContext ID and determined filetype for UX display
            resp = {'fuc_id': fuc_id}
            return gateway_json_response(resp)

        raise BadRequest('Invalid Upload')

    except Exception as e:
        return build_error_response(e)
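
The upload handlers above lean on werkzeug's secure_filename to keep user-supplied names from escaping upload_folder. A small, self-contained illustration of what it strips (the folder and sample names here are arbitrary):

    import os
    from werkzeug.utils import secure_filename

    upload_folder = '/tmp/uploads'  # stand-in for FileSystem.get_url(FS.TEMP, 'uploads')

    for raw in ('../../etc/passwd', 'qc results 2014.csv'):
        safe = secure_filename(raw)            # 'etc_passwd', 'qc_results_2014.csv'
        print('%s -> %s' % (raw, os.path.join(upload_folder, safe)))
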
 def _create_coverage(self, dataset_id, parameter_dict_id, time_dom, spatial_dom):
     pd = self.dataset_management_client.read_parameter_dictionary(parameter_dict_id)
     pdict = ParameterDictionary.load(pd)
     sdom = GridDomain.load(spatial_dom.dump())
     tdom = GridDomain.load(time_dom.dump())
     file_root = FileSystem.get_url(FS.CACHE,'datasets')
     scov = SimplexCoverage(file_root, dataset_id, dataset_id, parameter_dictionary=pdict, temporal_domain=tdom, spatial_domain=sdom)
     return scov
 def _create_view_coverage(self, dataset_id, description, parent_dataset_id):
     # As annoying as it is, we need to load the view coverage belonging to the parent dataset id and use the
     # information inside to build the new one...
     file_root = FileSystem.get_url(FS.CACHE,'datasets')
     pscov = self._get_simplex_coverage(parent_dataset_id, mode='r')
     scov_location = pscov.persistence_dir
     pscov.close()
     vcov = ViewCoverage(file_root, dataset_id, description or dataset_id, reference_coverage_location=scov_location)
     return vcov
 def _create_coverage(self, dataset_id, description, parameter_dict, spatial_domain,temporal_domain):
     file_root = FileSystem.get_url(FS.CACHE,'datasets')
     pdict = ParameterDictionary.load(parameter_dict)
     sdom = GridDomain.load(spatial_domain)
     tdom = GridDomain.load(temporal_domain)
     scov = self._create_simplex_coverage(dataset_id, pdict, sdom, tdom, self.inline_data_writes)
     vcov = ViewCoverage(file_root, dataset_id, description or dataset_id, reference_coverage_location=scov.persistence_dir)
     scov.close()
     return vcov
Example #12
    def process(self,packet):
        input = int(packet.get('num',0))
        prep = 'echo \'1+%d\' | bc' %(input)
        output = commands.getoutput(prep)
        if self.has_output:
            self.publish(dict(num=output))

        with open(FileSystem.get_url(FS.TEMP,"transform_output"),'a') as f:
            f.write('(%s): Received %s, transform: %s\n' %(self.name, packet, output))
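
The commands module used above is Python 2 only (it was removed in Python 3); the same 'echo 1+N | bc' pipeline can be run through subprocess, which both versions ship. A hedged equivalent, not taken from the transform itself:

    import subprocess

    def add_one_via_bc(num):
        # run the same shell pipeline as the transform above
        cmd = "echo '1+%d' | bc" % num
        return subprocess.check_output(cmd, shell=True).strip()

    # add_one_via_bc(41) -> '42' (bytes under Python 3, str under Python 2)
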
def upload_qc():
    upload_folder = FileSystem.get_url(FS.TEMP,'uploads')
    try:

        object_store = Container.instance.object_store
        
        # required fields
        upload = request.files['file'] # <input type=file name="file">

        if upload:

            # upload file - run filename through werkzeug.secure_filename
            filename = secure_filename(upload.filename)
            path = os.path.join(upload_folder, filename)
            upload_time = time.time()
            upload.save(path)
            filetype = _check_magic(upload) or 'CSV' # Either going to be ZIP or CSV, probably

            # register upload
            file_upload_context = {
                'name': 'User uploaded QC file %s' % filename,
                'filename': filename,
                'filetype': filetype,  # only CSV, no detection necessary
                'path': path,
                'upload_time': upload_time,
                'status': 'File uploaded to server'
            }
            fuc_id, _ = object_store.create_doc(file_upload_context)

            # client to process dispatch
            pd_client = ProcessDispatcherServiceClient()

            # create process definition
            process_definition = ProcessDefinition(
                name='upload_qc_processor',
                executable={
                    'module': 'ion.processes.data.upload.upload_qc_processing',
                    'class': 'UploadQcProcessing'
                }
            )
            process_definition_id = pd_client.create_process_definition(process_definition)
            # create process
            process_id = pd_client.create_process(process_definition_id)
            # schedule process
            config = DotDict()
            config.process.fuc_id = fuc_id
            pid = pd_client.schedule_process(process_definition_id, process_id=process_id, configuration=config)
            log.info('UploadQcProcessing process created %s' % pid)
            # response - only FileUploadContext ID and determined filetype for UX display
            resp = {'fuc_id': fuc_id}
            return gateway_json_response(resp)

        raise BadRequest('Invalid Upload')

    except Exception as e:
        return build_error_response(e)
Example #14
 def _create_complex_coverage(cls, dataset_id, description, parameter_dict):
     pdict = ParameterDictionary.load(parameter_dict)
     file_root = FileSystem.get_url(FS.CACHE, 'datasets')
     ccov = ComplexCoverage(
         file_root,
         dataset_id,
         'Complex Coverage for %s' % dataset_id,
         parameter_dictionary=pdict,
         complex_type=ComplexCoverageType.TEMPORAL_AGGREGATION)
     return ccov
Example #15
 def _create_simplex_coverage(cls, dataset_id, parameter_dictionary,
                              spatial_domain, temporal_domain):
     file_root = FileSystem.get_url(FS.CACHE, 'datasets')
     scov = SimplexCoverage(file_root,
                            dataset_id,
                            'Simplex Coverage for %s' % dataset_id,
                            parameter_dictionary=parameter_dictionary,
                            temporal_domain=temporal_domain,
                            spatial_domain=spatial_domain)
     return scov
    def process(self, packet):
        input = int(packet.get('num', 0))
        prep = 'echo \'1+%d\' | bc' % (input)
        output = commands.getoutput(prep)
        if self.has_output:
            self.publish(dict(num=output))

        with open(FileSystem.get_url(FS.TEMP, "transform_output"), 'a') as f:
            f.write('(%s): Received %s, transform: %s\n' %
                    (self.name, packet, output))
 def check_msg(msg, header):
     assertions(isinstance(msg, StreamGranuleContainer), 'Msg is not a container')
     hdf_string = msg.identifiables[msg.data_stream_id].values
     sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
     log.debug('Sha1 matches')
     log.debug('Dumping file so you can inspect it.')
     log.debug('Records: %d' % msg.identifiables['record_count'].value)
     with open(FileSystem.get_url(FS.TEMP,'%s.cap.hdf5' % sha1[:8]),'w') as f:
         f.write(hdf_string)
         log.debug('Stream Capture: %s', f.name)
     result.set(True)
Example #18
    def process(self, packet):
        """Processes incoming data!!!!
        """
        output = int(packet.get('num',0)) + 1
        log.debug('(%s) Processing Packet: %s',self.name,packet)
        log.debug('(%s) Transform Complete: %s', self.name, output)

        if self.has_output:
            self.publish(dict(num=str(output)))

        with open(FileSystem.get_url(FS.TEMP,"transform_output"),'a') as f:
            f.write('(%s): Received Packet: %s\n' % (self.name,packet))
            f.write('(%s):   - Transform - %d\n' % (self.name,output))
 def _create_view_coverage(self, dataset_id, description,
                           parent_dataset_id):
     # As annoying as it is, we need to load the view coverage belonging to the parent dataset id and use the
     # information inside to build the new one...
     file_root = FileSystem.get_url(FS.CACHE, 'datasets')
     pscov = self._get_simplex_coverage(parent_dataset_id, mode='r')
     scov_location = pscov.persistence_dir
     pscov.close()
     vcov = ViewCoverage(file_root,
                         dataset_id,
                         description or dataset_id,
                         reference_coverage_location=scov_location)
     return vcov
    def process(self, packet):
        """Processes incoming data!!!!
        """
        output = int(packet.get('num', 0)) + 1
        log.debug('(%s) Processing Packet: %s', self.name, packet)
        log.debug('(%s) Transform Complete: %s', self.name, output)

        if self.has_output:
            self.publish(dict(num=str(output)))

        with open(FileSystem.get_url(FS.TEMP, "transform_output"), 'a') as f:
            f.write('(%s): Received Packet: %s\n' % (self.name, packet))
            f.write('(%s):   - Transform - %d\n' % (self.name, output))
Example #21
 def check_msg(msg, header):
     assertions(isinstance(msg, StreamGranuleContainer),
                'Msg is not a container')
     hdf_string = msg.identifiables[msg.data_stream_id].values
     sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
     log.debug('Sha1 matches')
     log.debug('Dumping file so you can inspect it.')
     log.debug('Records: %d' % msg.identifiables['record_count'].value)
     with open(FileSystem.get_url(FS.TEMP, '%s.cap.hdf5' % sha1[:8]),
               'w') as f:
         f.write(hdf_string)
         log.debug('Stream Capture: %s', f.name)
     result.set(True)
Example #22
 def _create_coverage(self, dataset_id, description, parameter_dict,
                      spatial_domain, temporal_domain):
     pdict = ParameterDictionary.load(parameter_dict)
     sdom = GridDomain.load(spatial_domain)
     tdom = GridDomain.load(temporal_domain)
     file_root = FileSystem.get_url(FS.CACHE, 'datasets')
     scov = SimplexCoverage(file_root,
                            dataset_id,
                            description or dataset_id,
                            parameter_dictionary=pdict,
                            temporal_domain=tdom,
                            spatial_domain=sdom,
                            inline_data_writes=self.inline_data_writes)
     return scov
 def _create_coverage(self, dataset_id, description, parameter_dict,
                      spatial_domain, temporal_domain):
     file_root = FileSystem.get_url(FS.CACHE, 'datasets')
     pdict = ParameterDictionary.load(parameter_dict)
     sdom = GridDomain.load(spatial_domain)
     tdom = GridDomain.load(temporal_domain)
     scov = self._create_simplex_coverage(dataset_id, pdict, sdom, tdom,
                                          self.inline_data_writes)
     vcov = ViewCoverage(file_root,
                         dataset_id,
                         description or dataset_id,
                         reference_coverage_location=scov.persistence_dir)
     scov.close()
     return vcov
    def setUp(self):

        import numpy, h5py

        FileSystem(DotDict())

        #--------------------------------------------------------------------
        # Create an hdf file for testing
        #--------------------------------------------------------------------

        self.salinity = [0,]*3
        self.temperature = [0,]*3
        self.pressure = [0,]*3

        self.salinity[0] = numpy.arange(50)
        self.salinity[1] = numpy.arange(50) + 50
        self.salinity[2] = numpy.arange(50) + 100

        self.temperature[0] = numpy.random.normal(size=50)
        self.temperature[1] = numpy.random.normal(size=50)
        self.temperature[2] = numpy.random.normal(size=50)

        self.pressure[0] = numpy.random.uniform(low=0.0, high=1.0, size=50)
        self.pressure[1] = numpy.random.uniform(low=0.0, high=1.0, size=50)
        self.pressure[2] = numpy.random.uniform(low=0.0, high=1.0, size=50)

        # provide the check_pieces method with the size of the dataset so that it can do its checking
        self.sl = slice(0,150)

        self.fnames = [0,]*3
        for i in range(0,3):
            self.fnames[i] = FileSystem.get_url(FS.TEMP, 'data%d.hdf5' % (i+1))

        for fname, s, t, p in zip(self.fnames, self.salinity, self.temperature, self.pressure):
            file = h5py.File(fname, 'w')

            grp1 = file.create_group('fields')
            dset1 = grp1.create_dataset("salinity", data=s)
            dset2 = grp1.create_dataset("temperature", data=t)
            dset3 = grp1.create_dataset("pressure", data=p)

            file.close()


        # Concatenate the test values for comparison:

        self.t_result = numpy.concatenate((self.temperature[0],self.temperature[1],self.temperature[2]), axis = 0)
        self.s_result = numpy.concatenate((self.salinity[0],self.salinity[1],self.salinity[2]), axis = 0)
        self.p_result = numpy.concatenate((self.pressure[0],self.pressure[1],self.pressure[2]), axis = 0)
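
To make the comparison this setUp prepares explicit: each concatenated *_result array should equal the corresponding field read back from the three files in order. A small sketch of that check (read_field is a made-up helper, not part of the test class):

    import h5py
    import numpy

    def read_field(fnames, field):
        # read one field ('salinity', 'temperature' or 'pressure') from each
        # test file and concatenate, mirroring self.s_result / t_result / p_result
        pieces = []
        for fname in fnames:
            with h5py.File(fname, 'r') as f:
                pieces.append(f['fields/' + field][...])
        return numpy.concatenate(pieces, axis=0)

    # e.g., inside a test: numpy.testing.assert_array_equal(read_field(self.fnames, 'salinity'), self.s_result)
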
Example #25
    def __init__(self, hdf_string):
        """
        @param hdf_string
        """
        #try:
        assert isinstance(hdf_string, basestring), 'The input for instantiating the HDFDecoder object is not a string'
        #except AssertionError as err:
        #    raise HDFDecoderException(err.message)

        self.filename = FileSystem.get_url(fs=FS.TEMP, filename=hashlib.sha1(hdf_string).hexdigest(), ext='_decoder.hdf5')
        #try:
        # save the hdf string to disk - in /tmp - so we can open it as an hdf file and read data from it
        f = open(self.filename, mode='wb')
        f.write(hdf_string)
        f.close()
 def _create_simplex_coverage(cls,
                              dataset_id,
                              parameter_dictionary,
                              spatial_domain,
                              temporal_domain,
                              inline_data_writes=True):
     file_root = FileSystem.get_url(FS.CACHE, 'datasets')
     scov = SimplexCoverage(file_root,
                            uuid4().hex,
                            'Simplex Coverage for %s' % dataset_id,
                            parameter_dictionary=parameter_dictionary,
                            temporal_domain=temporal_domain,
                            spatial_domain=spatial_domain,
                            inline_data_writes=inline_data_writes)
     return scov
 def _create_coverage(self, dataset_id, parameter_dict_id, time_dom,
                      spatial_dom):
     pd = self.dataset_management_client.read_parameter_dictionary(
         parameter_dict_id)
     pdict = ParameterDictionary.load(pd)
     sdom = GridDomain.load(spatial_dom.dump())
     tdom = GridDomain.load(time_dom.dump())
     file_root = FileSystem.get_url(FS.CACHE, 'datasets')
     scov = SimplexCoverage(file_root,
                            dataset_id,
                            dataset_id,
                            parameter_dictionary=pdict,
                            temporal_domain=tdom,
                            spatial_domain=sdom)
     return scov
Example #28
    def __init__(self, name = None):
        """
        @param name The name of the dataset
        """
        # generate a random name for the filename if it has not been provided.
        self.filename = FileSystem.get_url(fs=FS.TEMP, filename=name or random_name(), ext='encoder.hdf5')

        # Using inline imports to put off making hdf/numpy required dependencies
        import h5py

        # open an hdf file on disk - in /tmp - to write data to, since we can't yet do it in memory
        log.debug("Creating h5py file object for the encoder at %s" % self.filename)
        if os.path.isfile(self.filename):
            # if file exists, then append to it
            self.h5pyfile = h5py.File(self.filename, mode = 'r+', driver='core')
        else:
            # if file does not already exist, write a new one
            self.h5pyfile = h5py.File(self.filename, mode = 'w', driver='core')
        assert self.h5pyfile, 'No h5py file object created.'
def upload_data(dataproduct_id):
    upload_folder = FileSystem.get_url(FS.TEMP,'uploads')
    try:

        rr_client = ResourceRegistryServiceProcessClient(process=service_gateway_instance)
        object_store = Container.instance.object_store

        try:
            rr_client.read(str(dataproduct_id))
        except BadRequest:
            raise BadRequest('Unknown DataProduct ID %s' % dataproduct_id)

        # required fields
        upload = request.files['file']  # <input type=file name="file">

        # determine filetype
        filetype = _check_magic(upload)
        upload.seek(0)  # return to beginning for save

        if upload and filetype is not None:

            # upload file - run filename through werkzeug.secure_filename
            filename = secure_filename(upload.filename)
            path = os.path.join(upload_folder, filename)
            upload_time = time.time()
            upload.save(path)

            # register upload
            file_upload_context = {
                # TODO add dataproduct_id
                'name':'User uploaded file %s' % filename,
                'filename':filename,
                'filetype':filetype,
                'path':path,
                'upload_time':upload_time,
                'status':'File uploaded to server'
            }
            fuc_id, _ = object_store.create_doc(file_upload_context)

            # client to process dispatch
            pd_client = ProcessDispatcherServiceClient()

            # create process definition
            process_definition = ProcessDefinition(
                name='upload_data_processor',
                executable={
                    'module':'ion.processes.data.upload.upload_data_processing',
                    'class':'UploadDataProcessing'
                }
            )
            process_definition_id = pd_client.create_process_definition(process_definition)
            # create process
            process_id = pd_client.create_process(process_definition_id)
            #schedule process
            config = DotDict()
            config.process.fuc_id = fuc_id
            config.process.dp_id = dataproduct_id
            pid = pd_client.schedule_process(process_definition_id, process_id=process_id, configuration=config)
            log.info('UploadDataProcessing process created %s' % pid)
            # response - only FileUploadContext ID and determined filetype for UX display
            resp = {'fuc_id': fuc_id}
            return gateway_json_response(resp)

        raise BadRequest('Invalid Upload')

    except Exception as e:
        return build_error_response(e)
 def _get_coverage(cls, dataset_id, mode='w'):
     file_root = FileSystem.get_url(FS.CACHE, 'datasets')
     coverage = AbstractCoverage.load(file_root, dataset_id, mode=mode)
     return coverage
 def _get_coverage_path(cls, dataset_id):
     file_root = FileSystem.get_url(FS.CACHE, 'datasets')
     return os.path.join(file_root, '%s' % dataset_id)
Example #32
    def _publish_query(self, results):
        '''
        Callback to publish the specified results
        '''
        #-----------------------
        # Iteration
        #-----------------------
        #  - Go through the results: if the user had include_docs=True in the options field
        #    then the full document is in result.doc; however, if the query did not set include_docs,
        #    then only the doc_id is provided in result.value.
        #
        #  - What this allows us to do is limit the amount of data transferred for large queries.
        #    If we are only making one query in a sequence of queries (such as map and reduce) then we don't
        #    care about the full document yet; we only care about the doc id and will retrieve the document later.
        #  - Example:
        #      Imagine the blogging example, we want the latest blog by author George and all the comments for that blog
        #      The series of queries would go, post_by_updated -> posts_by_author -> posts_join_comments and then
        #      in the last query we'll set include_docs to true and parse the docs.
        #-----------------------


        log.warn('results: %s', results)

        for result in results:
            log.warn('REPLAY Result: %s' % result)



            assert('doc' in result)

            replay_obj_msg = result['doc']

            if isinstance(replay_obj_msg, BlogBase):
                replay_obj_msg.is_replay = True

                self.lock.acquire()
                self.output.publish(replay_obj_msg)
                self.lock.release()

            elif isinstance(replay_obj_msg, StreamDefinitionContainer):

                replay_obj_msg.stream_resource_id = self.stream_id


            elif isinstance(replay_obj_msg, StreamGranuleContainer):

                # Override the resource_stream_id so ingestion doesn't reingest, also this is a NEW stream (replay)
                replay_obj_msg.stream_resource_id = self.stream_id

                datastream = None
                sha1 = None

                for key, identifiable in replay_obj_msg.identifiables.iteritems():
                    if isinstance(identifiable, DataStream):
                        datastream = identifiable
                    elif isinstance(identifiable, Encoding):
                        sha1 = identifiable.sha1

                if sha1: # if there is an encoding

                    # Get the file from disk
                    filename = FileSystem.get_url(FS.CACHE, sha1, ".hdf5")

                    log.warn('Replay reading from filename: %s' % filename)

                    hdf_string = ''
                    try:
                        with open(filename, mode='rb') as f:
                            hdf_string = f.read()
                            f.close()

                            # Check the Sha1
                            retrieved_hdfstring_sha1 = hashlib.sha1(hdf_string).hexdigest().upper()

                            if sha1 != retrieved_hdfstring_sha1:
                                raise ReplayProcessException('The sha1 in the datastream does not match the sha1 of the hdf_string read back from hdf storage')

                    except IOError:
                        log.warn('No HDF file found!')
                        #@todo deal with this situation? How?
                        hdf_string = 'HDF File %s not found!' % filename

                    # set the datastream.value field!
                    datastream.values = hdf_string

                else:
                    log.warn('No encoding in the StreamGranuleContainer!')

                self.lock.acquire()
                self.output.publish(replay_obj_msg)
                self.lock.release()


            else:
                 log.warn('Unknown type retrieved in DOC!')



        #@todo: log when there are not results
        if results is None:
            log.warn('No results found in replay query!')
        else:
            log.debug('Published replay!')
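
The include_docs behaviour described in the comment block at the top of _publish_query can be summed up in a few lines: with include_docs=True each result carries the full document under 'doc', otherwise only the doc id comes back under 'value' and the document has to be fetched separately. The sketch below mirrors that comment; the result layout and the db.get(doc_id) call are assumptions for illustration, not a verified datastore API:

    def extract_docs(results, db=None):
        docs = []
        for result in results or []:
            if 'doc' in result:          # query ran with include_docs=True
                docs.append(result['doc'])
            elif db is not None:         # only the doc id came back; fetch lazily
                docs.append(db.get(result['value']))
        return docs
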
Example #33
 def start(self):
     if self.container.has_capability(self.container.CCAP.FILE_SYSTEM):
         self.datastore_dir = FileSystem.get_url(FS.FILESTORE,
                                                 self.datastore_name)
     else:
         self.datastore_dir = "./tmp/%s" % self.datastore_name
def upload_data(dataproduct_id):
    upload_folder = FileSystem.get_url(FS.TEMP, 'uploads')
    try:

        rr_client = ResourceRegistryServiceProcessClient(
            node=Container.instance.node, process=service_gateway_instance)
        object_store = Container.instance.object_store

        try:
            rr_client.read(str(dataproduct_id))
        except BadRequest:
            raise BadRequest('Unknown DataProduct ID %s' % dataproduct_id)

        # required fields
        upload = request.files['file']  # <input type=file name="file">

        # determine filetype
        filetype = _check_magic(upload)
        upload.seek(0)  # return to beginning for save

        if upload and filetype is not None:

            # upload file - run filename through werkzeug.secure_filename
            filename = secure_filename(upload.filename)
            path = os.path.join(upload_folder, filename)
            upload_time = time.time()
            upload.save(path)

            # register upload
            file_upload_context = {
                # TODO add dataproduct_id
                'name': 'User uploaded file %s' % filename,
                'filename': filename,
                'filetype': filetype,
                'path': path,
                'upload_time': upload_time,
                'status': 'File uploaded to server'
            }
            fuc_id, _ = object_store.create_doc(file_upload_context)

            # client to process dispatch
            pd_client = ProcessDispatcherServiceClient()

            # create process definition
            process_definition = ProcessDefinition(
                name='upload_data_processor',
                executable={
                    'module': 'ion.processes.data.upload.upload_data_processing',
                    'class': 'UploadDataProcessing'
                })
            process_definition_id = pd_client.create_process_definition(
                process_definition)
            # create process
            process_id = pd_client.create_process(process_definition_id)
            #schedule process
            config = DotDict()
            config.process.fuc_id = fuc_id
            config.process.dp_id = dataproduct_id
            pid = pd_client.schedule_process(process_definition_id,
                                             process_id=process_id,
                                             configuration=config)
            log.info('UploadDataProcessing process created %s' % pid)
            # response - only FileUploadContext ID and determined filetype for UX display
            resp = {'fuc_id': fuc_id}
            return gateway_json_response(resp)

        raise BadRequest('Invalid Upload')

    except Exception as e:
        return build_error_response(e)
Example #35
 def start(self):
     if self.container.has_capability(self.container.CCAP.FILE_SYSTEM):
         self.datastore_dir = FileSystem.get_url(FS.FILESTORE, self.datastore_name)
     else:
         self.datastore_dir = "./tmp/%s" % self.datastore_name
Example #36
    def process_stream(self, packet, dset_config):
        """
        Accepts a stream packet along with an instruction (a dset_config). According to the received dset_config it
        processes the stream, e.g. storing it in hdf_storage or couch_storage.
        @param: packet The incoming data stream of type stream.
        @param: dset_config The dset_config telling this method what to do with the incoming data stream.
        """

        # Ignoring is_replay attribute now that we have a policy implementation
        if isinstance(packet, StreamGranuleContainer):

            if dset_config is None:
                log.warn('No dataset config for this stream!')
                return



            hdfstring = ''
            sha1 = ''

            for key,value in packet.identifiables.iteritems():
                if isinstance(value, DataStream):
                    hdfstring = value.values
                    value.values=''

                elif isinstance(value, Encoding):
                    sha1 = value.sha1



            if dset_config.archive_metadata is True:
                log.debug("Persisting data....")
                self.persist_immutable(packet )

            if dset_config.archive_data is True:
                #@todo - grab the filepath to save the hdf string somewhere..

                if hdfstring:

                    calculated_sha1 = hashlib.sha1(hdfstring).hexdigest().upper()

                    filename = FileSystem.get_url(FS.CACHE, calculated_sha1, ".hdf5")

                    if sha1 != calculated_sha1:
                        raise IngestionWorkerException('The stored sha1 differs from the sha1 calculated from the received hdf_string')

                    log.warn('writing to filename: %s' % filename)

                    with open(filename, mode='wb') as f:
                        f.write(hdfstring)
                        f.close()
                else:
                    log.warn("Nothing to write!")


        elif isinstance(packet, BlogPost) and not packet.is_replay:
            self.persist_immutable(packet )


        elif isinstance(packet, BlogComment) and not packet.is_replay:
            self.persist_immutable(packet)
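
The ingestion worker above and the replay process in Example #32 share one content-addressed convention: the HDF byte string is written to a file named after its upper-case SHA-1, and the digest is recomputed and compared whenever the file is read back. A stripped-down sketch of that convention using only the standard library (cache_dir stands in for FileSystem.get_url(FS.CACHE, ...)):

    import hashlib
    import os

    def store_by_sha1(cache_dir, payload):
        # name the file after the digest of its own contents
        sha1 = hashlib.sha1(payload).hexdigest().upper()
        path = os.path.join(cache_dir, sha1 + '.hdf5')
        with open(path, mode='wb') as f:
            f.write(payload)
        return sha1, path

    def load_and_verify(path, expected_sha1):
        with open(path, mode='rb') as f:
            payload = f.read()
        if hashlib.sha1(payload).hexdigest().upper() != expected_sha1:
            raise ValueError('sha1 mismatch for %s' % path)
        return payload
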
 def _create_simplex_coverage(cls, dataset_id, parameter_dictionary, spatial_domain, temporal_domain):
     file_root = FileSystem.get_url(FS.CACHE,'datasets')
     scov = SimplexCoverage(file_root,dataset_id,'Simplex Coverage for %s' % dataset_id, parameter_dictionary=parameter_dictionary, temporal_domain=temporal_domain, spatial_domain=spatial_domain )
     return scov
 def _get_coverage_path(cls, dataset_id):
     file_root = FileSystem.get_url(FS.CACHE,'datasets')
     return os.path.join(file_root, '%s' % dataset_id)
Example #39
 def _get_coverage(cls, dataset_id, mode='w'):
     file_root = FileSystem.get_url(FS.CACHE, 'datasets')
     coverage = SimplexCoverage(file_root, dataset_id, mode=mode)
     return coverage
 def _get_coverage(cls,dataset_id,mode='r'):
     file_root = FileSystem.get_url(FS.CACHE,'datasets')
     coverage = AbstractCoverage.load(file_root, dataset_id, mode=mode)
     return coverage
Example #41
    def on_start(self):
        super(TransformCapture, self).on_start()
#        #@todo: Remove debugging statements
        log.debug('(Transform: %s) Starting...',self.name)
        self.file_name = self.CFG.get_safe('process.file_name',FileSystem.get_url(FS.TEMP,'transform_output'))
 def _create_simplex_coverage(cls, dataset_id, parameter_dictionary, spatial_domain, temporal_domain, inline_data_writes=True):
     file_root = FileSystem.get_url(FS.CACHE,'datasets')
     scov = SimplexCoverage(file_root,uuid4().hex,'Simplex Coverage for %s' % dataset_id, parameter_dictionary=parameter_dictionary, temporal_domain=temporal_domain, spatial_domain=spatial_domain, inline_data_writes=inline_data_writes)
     return scov
Example #43
    def setUp(self):

        import numpy, h5py

        FileSystem(DotDict())

        #--------------------------------------------------------------------
        # Create an hdf file for testing
        #--------------------------------------------------------------------

        self.salinity = [
            0,
        ] * 3
        self.temperature = [
            0,
        ] * 3
        self.pressure = [
            0,
        ] * 3

        self.salinity[0] = numpy.arange(50)
        self.salinity[1] = numpy.arange(50) + 50
        self.salinity[2] = numpy.arange(50) + 100

        self.temperature[0] = numpy.random.normal(size=50)
        self.temperature[1] = numpy.random.normal(size=50)
        self.temperature[2] = numpy.random.normal(size=50)

        self.pressure[0] = numpy.random.uniform(low=0.0, high=1.0, size=50)
        self.pressure[1] = numpy.random.uniform(low=0.0, high=1.0, size=50)
        self.pressure[2] = numpy.random.uniform(low=0.0, high=1.0, size=50)

        # provide the check_pieces method with the size of the dataset so that it can do its checking
        self.sl = slice(0, 150)

        self.fnames = [
            0,
        ] * 3
        for i in range(0, 3):
            self.fnames[i] = FileSystem.get_url(FS.TEMP,
                                                'data%d.hdf5' % (i + 1))

        for fname, s, t, p in zip(self.fnames, self.salinity, self.temperature,
                                  self.pressure):
            file = h5py.File(fname, 'w')

            grp1 = file.create_group('fields')
            dset1 = grp1.create_dataset("salinity", data=s)
            dset2 = grp1.create_dataset("temperature", data=t)
            dset3 = grp1.create_dataset("pressure", data=p)

            file.close()

        # Concatenate the test values for comparison:

        self.t_result = numpy.concatenate(
            (self.temperature[0], self.temperature[1], self.temperature[2]),
            axis=0)
        self.s_result = numpy.concatenate(
            (self.salinity[0], self.salinity[1], self.salinity[2]), axis=0)
        self.p_result = numpy.concatenate(
            (self.pressure[0], self.pressure[1], self.pressure[2]), axis=0)
 def _create_complex_coverage(cls, dataset_id, description, parameter_dict):
     pdict = ParameterDictionary.load(parameter_dict)
     file_root = FileSystem.get_url(FS.CACHE, 'datasets')
     ccov = ComplexCoverage(file_root, dataset_id, 'Complex Coverage for %s' % dataset_id, parameter_dictionary=pdict, complex_type=ComplexCoverageType.TEMPORAL_AGGREGATION)
     return ccov
Example #45
    def test_dm_integration(self):
        '''
        test_dm_integration
        Test full DM Services Integration
        '''
        cc = self.container
        assertions = self.assertTrue

        #-----------------------------
        # Copy below here
        #-----------------------------
        pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
        ingestion_management_service = IngestionManagementServiceClient(node=cc.node)
        dataset_management_service = DatasetManagementServiceClient(node=cc.node)
        data_retriever_service = DataRetrieverServiceClient(node=cc.node)
        transform_management_service = TransformManagementServiceClient(node=cc.node)
        process_dispatcher = ProcessDispatcherServiceClient(node=cc.node)

        process_list = []
        datasets = []

        datastore_name = 'test_dm_integration'


        #---------------------------
        # Set up ingestion
        #---------------------------
        # Configure ingestion using eight workers, ingesting to test_dm_integration datastore with the SCIDATA profile
        log.debug('Calling create_ingestion_configuration')
        ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
            exchange_point_id='science_data',
            couch_storage=CouchStorage(datastore_name=datastore_name,datastore_profile='SCIDATA'),
            number_of_workers=8
        )
        #
        ingestion_management_service.activate_ingestion_configuration(
            ingestion_configuration_id=ingestion_configuration_id)

        ctd_stream_def = ctd_stream_definition()

        stream_def_id = pubsub_management_service.create_stream_definition(container=ctd_stream_def, name='Junk definition')


        #---------------------------
        # Set up the producers (CTD Simulators)
        #---------------------------
        # Launch five simulated CTD producers
        for iteration in xrange(5):
            # Make a stream to output on

            stream_id = pubsub_management_service.create_stream(stream_definition_id=stream_def_id)

            #---------------------------
            # Set up the datasets
            #---------------------------
            dataset_id = dataset_management_service.create_dataset(
                stream_id=stream_id,
                datastore_name=datastore_name,
                view_name='datasets/stream_join_granule'
            )
            # Keep track of the datasets
            datasets.append(dataset_id)

            stream_policy_id = ingestion_management_service.create_dataset_configuration(
                dataset_id = dataset_id,
                archive_data = True,
                archive_metadata = True,
                ingestion_configuration_id = ingestion_configuration_id
            )


            producer_definition = ProcessDefinition()
            producer_definition.executable = {
                'module':'ion.processes.data.ctd_stream_publisher',
                'class':'SimpleCtdPublisher'
            }
            configuration = {
                'process':{
                    'stream_id':stream_id,
                    'datastore_name':datastore_name
                }
            }
            procdef_id = process_dispatcher.create_process_definition(process_definition=producer_definition)
            log.debug('LUKE_DEBUG: procdef_id: %s', procdef_id)
            pid = process_dispatcher.schedule_process(process_definition_id=procdef_id, configuration=configuration)


            # Keep track, we'll kill 'em later.
            process_list.append(pid)
        # Get about 4 seconds of data
        time.sleep(4)

        #---------------------------
        # Stop producing data
        #---------------------------

        for process in process_list:
            process_dispatcher.cancel_process(process)

        #----------------------------------------------
        # The replay and the transform, a love story.
        #----------------------------------------------
        # Happy Valentines to the clever coder who catches the above!

        transform_definition = ProcessDefinition()
        transform_definition.executable = {
            'module':'ion.processes.data.transforms.transform_example',
            'class':'TransformCapture'
        }
        transform_definition_id = process_dispatcher.create_process_definition(process_definition=transform_definition)

        dataset_id = datasets.pop() # Just need one for now
        replay_id, stream_id = data_retriever_service.define_replay(dataset_id=dataset_id)

        #--------------------------------------------
        # I'm Selling magazine subscriptions here!
        #--------------------------------------------

        subscription = pubsub_management_service.create_subscription(query=StreamQuery(stream_ids=[stream_id]),
            exchange_name='transform_capture_point')

        #--------------------------------------------
        # Start the transform (capture)
        #--------------------------------------------
        transform_id = transform_management_service.create_transform(
            name='capture_transform',
            in_subscription_id=subscription,
            process_definition_id=transform_definition_id
        )

        transform_management_service.activate_transform(transform_id=transform_id)

        #--------------------------------------------
        # BEGIN REPLAY!
        #--------------------------------------------

        data_retriever_service.start_replay(replay_id=replay_id)

        #--------------------------------------------
        # Lets get some boundaries
        #--------------------------------------------

        bounds = dataset_management_service.get_dataset_bounds(dataset_id=dataset_id)
        assertions('latitude_bounds' in bounds, 'dataset_id: %s' % dataset_id)
        assertions('longitude_bounds' in bounds)
        assertions('pressure_bounds' in bounds)

        #--------------------------------------------
        # Make sure the transform capture worked
        #--------------------------------------------

        time.sleep(3) # Give the other processes up to 3 seconds to catch up


        stats = os.stat(FileSystem.get_url(FS.TEMP,'transform_output'))
        assertions(stats.st_size > 0)  # the capture file should contain data (st_size, not st_blksize)

        # BEAUTIFUL!

        FileSystem.unlink(FileSystem.get_url(FS.TEMP,'transform_output'))
 def _get_coverage(cls,dataset_id,mode='w'):
     file_root = FileSystem.get_url(FS.CACHE,'datasets')
     coverage = SimplexCoverage(file_root, dataset_id,mode=mode)
     return coverage