Example #1
    def make_some_data(self):
        import numpy as np

        stream_id = 'I am very special'
        definition = SBE37_CDM_stream_definition()
        definition.stream_resource_id = stream_id

        self.couch.create(definition)

        total = 200
        n = 10 # at most n records per granule
        i = 0

        while i < total:
            r = random.randint(1,n)

            psc = PointSupplementConstructor(point_definition=definition, stream_id=stream_id)
            for x in xrange(r):
                i+=1
                point_id = psc.add_point(time=i, location=(0,0,0))
                psc.add_scalar_point_coverage(point_id=point_id, coverage_id='temperature', value=np.random.normal(loc=48.0,scale=4.0, size=1)[0])
                psc.add_scalar_point_coverage(point_id=point_id, coverage_id='pressure', value=np.float32(1.0))
                psc.add_scalar_point_coverage(point_id=point_id, coverage_id='conductivity', value=np.float32(2.0))
            granule = psc.close_stream_granule()
            hdf_string = granule.identifiables[definition.data_stream_id].values
            sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
            with open(FileSystem.get_hierarchical_url(FS.CACHE, '%s.hdf5' % sha1), 'wb') as f:
                f.write(hdf_string)
            granule.identifiables[definition.data_stream_id].values = ''
            self.couch.create(granule)
Example #2
    def read_persisted_cache(self, sha1, encoding):
        byte_string = None
        path = FileSystem.get_hierarchical_url(FS.CACHE, sha1, '.%s' % encoding)
        try:
            with open(path, 'r') as f:
                byte_string = f.read()
        except IOError as e:
            raise BadRequest(e.message)
        return byte_string
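
Examples #1 and #2 are two halves of the same pattern: the cache file is named after the SHA1 of its contents, written once, and later resolved again through FileSystem.get_hierarchical_url. A minimal round-trip sketch of that pattern follows; the import paths for FileSystem, FS, and BadRequest are assumptions (they are not shown in the snippets), and cache_bytes/read_cached_bytes are hypothetical helper names.

    import hashlib

    from pyon.core.exception import BadRequest     # assumed import path
    from pyon.util.file_sys import FileSystem, FS  # assumed import path

    def cache_bytes(byte_string, encoding='hdf5'):
        # Name the cache file after the SHA1 of its contents, as the snippets above do.
        sha1 = hashlib.sha1(byte_string).hexdigest().upper()
        path = FileSystem.get_hierarchical_url(FS.CACHE, sha1, '.%s' % encoding)
        with open(path, 'wb') as f:
            f.write(byte_string)
        return sha1

    def read_cached_bytes(sha1, encoding='hdf5'):
        # Mirror of read_persisted_cache: resolve the same hierarchical URL and read it back.
        path = FileSystem.get_hierarchical_url(FS.CACHE, sha1, '.%s' % encoding)
        try:
            with open(path, 'rb') as f:
                return f.read()
        except IOError as e:
            raise BadRequest(str(e))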
Example #3
    def process_stream(self, packet, dset_config):
        """
        Accepts a stream. Also accepts instruction (a dset_config). According to the received dset_config it processes the
        stream such as store in hfd_storage, couch_storage.
        @param: packet The incoming data stream of type stream.
        @param: dset_config The dset_config telling this method what to do with the incoming data stream.
        """


        ingestion_attributes = {
            'variables': [],
            'number_of_records': -1,
            'updated_metadata': False,
            'updated_data': False
        }

        if dset_config is None:
            log.info('No dataset config for this stream!')
            return


        # Get back to the serialized form - the process receives only the IonObject after the interceptor stack has decoded it...
        simple_dict = ion_serializer.serialize(packet) #packet is an ion_object
        byte_string = msgpack.packb(simple_dict, default=encode_ion)

        encoding_type = 'ion_msgpack'

        # Persisted sha1 is crafted from the byte string msgpack creates
        calculated_sha1 = hashlib.sha1(byte_string).hexdigest().upper()

        dataset_granule = {
            'stream_id'      : dset_config.stream_id,
            'dataset_id'     : dset_config.dataset_id,
            'persisted_sha1' : calculated_sha1,
            'encoding_type'  : encoding_type,
            'ts_create'      : get_ion_ts()
        }


        self.persist_immutable(dataset_granule)

        filename = FileSystem.get_hierarchical_url(FS.CACHE, calculated_sha1, ".%s" % encoding_type)

        with open(filename, mode='wb') as f:
            f.write(byte_string)


        return ingestion_attributes
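
Reading the packet back reverses each step of the persistence path above. The sketch below is only illustrative: restore_packet is a hypothetical helper, decode_ion is an assumed counterpart to encode_ion, and ion_serializer.deserialize is an assumed method; FileSystem, FS, ion_serializer and decode_ion are taken to be in scope as in the surrounding module.

    import hashlib
    import msgpack

    def restore_packet(sha1, encoding_type='ion_msgpack'):
        # Resolve the cache file that process_stream wrote for this sha1/encoding pair.
        path = FileSystem.get_hierarchical_url(FS.CACHE, sha1, '.%s' % encoding_type)
        with open(path, 'rb') as f:
            byte_string = f.read()

        # The filename is derived from the byte string's SHA1, so it can be re-verified.
        if hashlib.sha1(byte_string).hexdigest().upper() != sha1:
            raise ValueError('cached bytes do not match the expected sha1')

        # Reverse of msgpack.packb(..., default=encode_ion); decode_ion is the assumed
        # object_hook that rebuilds the ION-specific types from their packed form.
        simple_dict = msgpack.unpackb(byte_string, object_hook=decode_ion)

        # Reverse of ion_serializer.serialize(packet); deserialize is assumed here.
        return ion_serializer.deserialize(simple_dict)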
Example #4
    def persist_file(self, file_data='', digest='', metadata=None):
        ds = self.container.datastore_manager.get_datastore(
            self.datastore_name, DS.DS_PROFILE.FILESYSTEM)
        validate_is_instance(file_data, basestring,
                             "File or binary data must be a string.")
        validate_is_instance(metadata, File)

        if self.list_files(metadata.name + metadata.extension):
            raise BadRequest('%s already exists.' %
                             (metadata.name + metadata.extension))

        digest_ = sha224(file_data).hexdigest()
        if digest:
            validate_equal(
                digest, digest_,
                "The provided digest does not match the file's digest. Ensure you are using sha224."
            )
        else:
            digest = digest_

        extension = metadata.extension
        if '.' in metadata.name:
            t = metadata.name.split('.')
            metadata.name, metadata.extension = ('.'.join(t[:-1]), '.' + t[-1])
        url = FileSystem.get_hierarchical_url(FS.CACHE, digest, extension)
        try:
            with open(url, 'w+b') as f:
                f.write(file_data)
        except Exception:
            log.exception('Failed to write %s', url)
            raise BadRequest('Could not successfully write file data')
        if metadata.name[0] != '/':
            metadata.name = '/' + metadata.name
        metadata.url = url
        metadata.digest = digest
        metadata.created_date = IonTime().to_string()
        metadata.modified_date = IonTime().to_string()
        metadata.size = len(file_data)

        doc_id, rev_id = ds.create(metadata)
        return doc_id
Example #5
    def make_some_data(self):
        import numpy as np

        stream_id = 'I am very special'
        definition = SBE37_CDM_stream_definition()
        definition.stream_resource_id = stream_id

        self.couch.create(definition)

        total = 200
        n = 10  # at most n records per granule
        i = 0

        while i < total:
            r = random.randint(1, n)

            psc = PointSupplementConstructor(point_definition=definition,
                                             stream_id=stream_id)
            for x in xrange(r):
                i += 1
                point_id = psc.add_point(time=i, location=(0, 0, 0))
                psc.add_scalar_point_coverage(
                    point_id=point_id,
                    coverage_id='temperature',
                    value=np.random.normal(loc=48.0, scale=4.0, size=1)[0])
                psc.add_scalar_point_coverage(point_id=point_id,
                                              coverage_id='pressure',
                                              value=np.float32(1.0))
                psc.add_scalar_point_coverage(point_id=point_id,
                                              coverage_id='conductivity',
                                              value=np.float32(2.0))
            granule = psc.close_stream_granule()
            hdf_string = granule.identifiables[
                definition.data_stream_id].values
            sha1 = hashlib.sha1(hdf_string).hexdigest().upper()
            with open(
                    FileSystem.get_hierarchical_url(FS.CACHE,
                                                    '%s.hdf5' % sha1),
                    'wb') as f:
                f.write(hdf_string)
            granule.identifiables[definition.data_stream_id].values = ''
            self.couch.create(granule)
Example #6
    def _parse_granule(self, granule):
        '''
        @brief Ensures the granule is valid and gets some metadata from the granule for building the dataset
        @param granule raw granule straight from couch
        @return The metadata in the granule, along with the granule itself, if valid; otherwise None.
        '''

        granule.stream_resource_id = self.stream_id

        element_count_id = self.element_count_id
        encoding_id = self.encoding_id

        record_count = granule.identifiables[element_count_id].value

        # If there are no records then this is not a proper granule
        if not (record_count > 0):
            log.debug('Granule had no record count, discarding.')
            return None

        # No encoding, no packet (check membership before touching the encoding's sha1)
        if encoding_id not in granule.identifiables:
            log.debug('Granule had no encoding, discarding.')
            return None

        sha1 = granule.identifiables[encoding_id].sha1 or None
        if not sha1:
            log.debug('Granule had no sha1, discarding.')
            return None


        filepath = FileSystem.get_hierarchical_url(FS.CACHE, sha1, '.hdf5')

        if not os.path.exists(filepath):
            log.debug('File with sha1 does not exist')
            return None

        return {
            'granule': granule,
            'records': record_count,
            'sha1': sha1
        }
Example #7
    def _parse_granule(self, granule):
        '''
        @brief Ensures the granule is valid and gets some metadata from the granule for building the dataset
        @param granule raw granule straight from couch
        @return The metadata in the granule, along with the granule itself, if valid; otherwise None.
        '''

        granule.stream_resource_id = self.stream_id

        element_count_id = self.element_count_id
        encoding_id = self.encoding_id

        record_count = granule.identifiables[element_count_id].value

        # If there are no records then this is not a proper granule
        if not (record_count > 0):
            log.debug('Granule had no record count, discarding.')
            return None

        # No encoding, no packet (check membership before touching the encoding's sha1)
        if encoding_id not in granule.identifiables:
            log.debug('Granule had no encoding, discarding.')
            return None

        sha1 = granule.identifiables[encoding_id].sha1 or None
        if not sha1:
            log.debug('Granule had no sha1, discarding.')
            return None

        filepath = FileSystem.get_hierarchical_url(FS.CACHE, sha1, '.hdf5')

        if not os.path.exists(filepath):
            log.debug('File with sha1 does not exist')
            return None

        return {'granule': granule, 'records': record_count, 'sha1': sha1}
Example #8
    def persist_file(self, file_data='', digest='', metadata=None):
        ds = self.container.datastore_manager.get_datastore(self.datastore_name, DS.DS_PROFILE.FILESYSTEM)
        validate_is_instance(file_data, basestring, "File or binary data must be a string.")
        validate_is_instance(metadata, File)

        if self.list_files(metadata.name + metadata.extension):
            raise BadRequest('%s already exists.' % (metadata.name + metadata.extension))

        digest_ = sha224(file_data).hexdigest()
        if digest:
            validate_equal(digest, digest_, "The provided digest does not match the file's digest. Ensure you are using sha224.")
        else:
            digest = digest_

        extension = metadata.extension
        if '.' in metadata.name:
            t = metadata.name.split('.')
            metadata.name, metadata.extension = ('.'.join(t[:-1]), '.' + t[-1])
        url = FileSystem.get_hierarchical_url(FS.CACHE, digest, extension)
        try:
            with open(url, 'w+b') as f:
                f.write(file_data)
        except Exception:
            log.exception('Failed to write %s', url)
            raise BadRequest('Could not successfully write file data')
        if metadata.name[0] != '/':
            metadata.name = '/' + metadata.name
        metadata.url = url
        metadata.digest = digest
        metadata.created_date = IonTime().to_string()
        metadata.modified_date = IonTime().to_string()
        metadata.size = len(file_data)

        doc_id, rev_id = ds.create(metadata)
        return doc_id
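
A hypothetical caller for persist_file, shown only to make the digest contract concrete: the client computes the sha224 digest of the bytes and passes it along so the service can verify it. The File construction, the file path, and the file_service handle are assumptions for illustration; only the name/extension fields that persist_file actually reads are set.

    from hashlib import sha224

    # Hypothetical payload and metadata; in ION the File resource may be built via IonObject instead.
    file_data = open('/tmp/ctd_profile.csv', 'rb').read()
    metadata = File(name='ctd_profile', extension='.csv')

    digest = sha224(file_data).hexdigest()
    doc_id = file_service.persist_file(file_data=file_data, digest=digest, metadata=metadata)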
Example #9
    def process_stream(self, packet, dset_config):
        """
        Accepts a stream. Also accepts instruction (a dset_config). According to the received dset_config it processes the
        stream such as store in hfd_storage, couch_storage.
        @param: packet The incoming data stream of type stream.
        @param: dset_config The dset_config telling this method what to do with the incoming data stream.
        """

        ingestion_attributes = {
            'variables': [],
            'number_of_records': -1,
            'updated_metadata': False,
            'updated_data': False
        }

        if dset_config is None:
            log.info('No dataset config for this stream!')
            return

        values_string = ''
        sha1 = ''
        encoding_type = ''

        for key, value in packet.identifiables.iteritems():
            if isinstance(value, DataStream):
                values_string = value.values
                value.values = ''

            elif isinstance(value, Encoding):
                sha1 = value.sha1
                encoding_type = value.encoding_type

            elif isinstance(value, Coverage):
                ingestion_attributes['variables'].append(key)

            elif isinstance(value, CountElement):
                ingestion_attributes['number_of_records'] = value.value

        if dset_config.archive_metadata is True:
            log.debug("Persisting data....")
            ingestion_attributes['updated_metadata'] = True
            self.persist_immutable(packet)

        if dset_config.archive_data is True:
            #@todo - grab the filepath to save the hdf string somewhere..

            ingestion_attributes['updated_data'] = True
            if values_string:

                calculated_sha1 = hashlib.sha1(
                    values_string).hexdigest().upper()

                filename = FileSystem.get_hierarchical_url(
                    FS.CACHE, calculated_sha1, ".%s" % encoding_type)

                if sha1 != calculated_sha1:
                    raise IngestionWorkerException(
                        'The stored sha1 differs from the one calculated from the received hdf_string'
                    )

                #log.warn('writing to filename: %s' % filename)

                with open(filename, mode='wb') as f:
                    f.write(values_string)
            else:
                log.warn("Nothing to write!")

        return ingestion_attributes
Example #10
    def _get_coverage(cls, dataset_id):
        filename = FileSystem.get_hierarchical_url(FS.CACHE, dataset_id, '.cov')
        coverage = SimplexCoverage.load(filename)
        return coverage
Example #11
    def _persist_coverage(cls, dataset_id, coverage):
        validate_is_instance(coverage, SimplexCoverage,
                             'Coverage is not an instance of SimplexCoverage: %s' % type(coverage))
        filename = FileSystem.get_hierarchical_url(FS.CACHE, dataset_id, '.cov')
        SimplexCoverage.save(coverage, filename, use_ascii=False)
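
Examples #10 and #11 are the save and load halves of coverage persistence: both resolve the same dataset_id-keyed '.cov' path under the cache. A round-trip sketch using only the calls shown above (SimplexCoverage, FileSystem and FS are assumed to be imported as in the snippets, and save_and_reload_coverage is a hypothetical helper name):

    def save_and_reload_coverage(dataset_id, coverage):
        # Both helpers key the cache file on the dataset id with a .cov extension.
        filename = FileSystem.get_hierarchical_url(FS.CACHE, dataset_id, '.cov')

        # Persist the coverage (Example #11)...
        SimplexCoverage.save(coverage, filename, use_ascii=False)

        # ...and load it back (Example #10).
        return SimplexCoverage.load(filename)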
Example #12
    def _merge(self, msgs):
        '''
        @brief Merges all the granules and datasets into one large dataset (Union)
        @param msgs raw granules from couch
        @return complete dataset
        @description
             n
        D := U [ msgs_i ]
            i=0
        '''
        granule = None
        file_list = list()
        count = len(msgs)
        used_vals = list()

        #-------------------------------------------------------------------------------------
        # Merge each granule to another granule one by one.
        # After each merge operation keep track of what files belong where on the timeline
        #-------------------------------------------------------------------------------------

        for i in xrange(count):
            if i == 0:
                granule = msgs[0]['granule']
                psc = PointSupplementConstructor(
                    point_definition=self.definition)

                res = ReplayProcess.merge_granule(definition=self.definition,
                                                  granule1=granule,
                                                  granule2=None)
                granule = res['granule']
                file_pair = res['files']
                log.debug('file_pair: %s', file_pair)

                if file_pair[0] not in file_list and file_pair[0][
                        0] not in used_vals:
                    file_list.append(tuple(file_pair[0]))
                    used_vals.append(file_pair[0][0])

            else:
                res = ReplayProcess.merge_granule(definition=self.definition,
                                                  granule1=granule,
                                                  granule2=msgs[i]['granule'])

                granule = res['granule']
                file_pair = res['files']
                log.debug('file_pair: %s', file_pair)

                if file_pair[0] not in file_list and file_pair[0][
                        0] not in used_vals:
                    file_list.append(tuple(file_pair[0]))
                    used_vals.append(file_pair[0][0])
                if file_pair[1] not in file_list and file_pair[1][
                        0] not in used_vals:
                    file_list.append(tuple(file_pair[1]))
                    used_vals.append(file_pair[1][0])

        if not granule:
            return
        log.debug('file_list: %s', file_list)
        #-------------------------------------------------------------------------------------
        # Order the list using Python's stable sort (by the first value in the tuples),
        # then peel off just the file names,
        # then get the appropriate URL for each file using FileSystem
        #-------------------------------------------------------------------------------------
        file_list.sort()
        file_list = list(i[1] for i in file_list)
        file_list = list([
            FileSystem.get_hierarchical_url(FS.CACHE, '%s' % i)
            for i in file_list
        ])

        pairs = self._pair_up(granule)
        var_names = list([i[0] for i in pairs])

        record_count = granule.identifiables[self.element_count_id].value
        codec = HDFEncoder()
        log.debug('acquire_data:')
        log.debug('\tfile_list: %s', file_list)
        log.debug('\tfields: %s', var_names)
        log.debug('\trecords: %s', record_count)

        data = acquire_data(file_list, var_names, record_count).next()

        for row, value in data.iteritems():
            value_path = self._find_vp(pairs, row)
            codec.add_hdf_dataset(value_path, nparray=value['values'])
            #-------------------------------------------------------------------------------------
            # Debugging
            #-------------------------------------------------------------------------------------
            log.debug('row: %s', row)
            log.debug('value path: %s', value_path)
            log.debug('value: %s', value['values'])

        hdf_string = codec.encoder_close()
        self._patch_granule(granule, hdf_string)
        return granule
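
The bookkeeping inside _merge reduces to: collect (timeline-index, filename) pairs while merging, skip indices already seen, stable-sort by the index, peel off the filenames, and map them to cache URLs. A small standalone sketch of just that ordering step, using made-up pair values (FileSystem and FS assumed in scope as above):

    # Hypothetical (index, filename) pairs accumulated during the merges.
    file_pairs = [(2, 'B.hdf5'), (0, 'A.hdf5'), (1, 'C.hdf5'), (2, 'B.hdf5')]

    used_vals = set()
    file_list = []
    for pair in file_pairs:
        if pair[0] not in used_vals:       # same duplicate guard as used_vals above
            used_vals.add(pair[0])
            file_list.append(pair)

    file_list.sort()                       # stable sort by the first element of each tuple
    file_list = [p[1] for p in file_list]  # peel off just the file names
    urls = [FileSystem.get_hierarchical_url(FS.CACHE, '%s' % name) for name in file_list]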
Example #13
    def process_stream(self, packet, dset_config):
        """
        Accepts a stream. Also accepts instruction (a dset_config). According to the received dset_config it processes the
        stream such as store in hfd_storage, couch_storage.
        @param: packet The incoming data stream of type stream.
        @param: dset_config The dset_config telling this method what to do with the incoming data stream.
        """


        ingestion_attributes = {
            'variables': [],
            'number_of_records': -1,
            'updated_metadata': False,
            'updated_data': False
        }

        if dset_config is None:
            log.info('No dataset config for this stream!')
            return

        values_string = ''
        sha1 = ''
        encoding_type = ''

        for key, value in packet.identifiables.iteritems():
            if isinstance(value, DataStream):
                values_string = value.values
                value.values = ''

            elif isinstance(value, Encoding):
                sha1 = value.sha1
                encoding_type = value.encoding_type

            elif isinstance(value, Coverage):
                ingestion_attributes['variables'].append(key)

            elif isinstance(value, CountElement):
                ingestion_attributes['number_of_records'] = value.value

        if dset_config.archive_metadata is True:
            log.debug("Persisting data....")
            ingestion_attributes['updated_metadata'] = True
            self.persist_immutable(packet)

        if dset_config.archive_data is True:
            #@todo - grab the filepath to save the hdf string somewhere..

            ingestion_attributes['updated_data'] = True
            if values_string:

                calculated_sha1 = hashlib.sha1(values_string).hexdigest().upper()

                filename = FileSystem.get_hierarchical_url(FS.CACHE, calculated_sha1, ".%s" % encoding_type)

                if sha1 != calculated_sha1:
                    raise IngestionWorkerException('The stored sha1 differs from the one calculated from the received hdf_string')

                #log.warn('writing to filename: %s' % filename)

                with open(filename, mode='wb') as f:
                    f.write(values_string)
            else:
                log.warn("Nothing to write!")


        return ingestion_attributes
Example #14
    def test_raw_stream_integration(self):
        cc = self.container
        assertions = self.assertTrue

        #-----------------------------
        # Copy below here to run as a script (don't forget the imports of course!)
        #-----------------------------

        # Create some service clients...
        pubsub_management_service = PubsubManagementServiceClient(node=cc.node)
        ingestion_management_service = IngestionManagementServiceClient(
            node=cc.node)
        dataset_management_service = DatasetManagementServiceClient(
            node=cc.node)
        process_dispatcher = ProcessDispatcherServiceClient(node=cc.node)

        # declare some handy variables

        datastore_name = 'test_dm_integration'

        ###
        ### In the beginning there was one stream definition...
        ###
        # create a stream definition for the data from the ctd simulator
        raw_ctd_stream_def = SBE37_RAW_stream_definition()
        raw_ctd_stream_def_id = pubsub_management_service.create_stream_definition(
            container=raw_ctd_stream_def, name='Simulated RAW CTD data')

        ###
        ### And two process definitions...
        ###
        # one for the ctd simulator...
        producer_definition = ProcessDefinition()
        producer_definition.executable = {
            'module': 'ion.processes.data.raw_stream_publisher',
            'class': 'RawStreamPublisher'
        }

        raw_ctd_sim_procdef_id = process_dispatcher.create_process_definition(
            process_definition=producer_definition)

        #---------------------------
        # Set up ingestion - this is an operator concern - not done by SA in a deployed system
        #---------------------------
        # Configure ingestion with one worker, ingesting to the test_dm_integration datastore with the SCIDATA profile
        log.debug('Calling create_ingestion_configuration')
        ingestion_configuration_id = ingestion_management_service.create_ingestion_configuration(
            exchange_point_id='science_data',
            couch_storage=CouchStorage(datastore_name=datastore_name,
                                       datastore_profile='SCIDATA'),
            number_of_workers=1)
        #
        ingestion_management_service.activate_ingestion_configuration(
            ingestion_configuration_id=ingestion_configuration_id)

        #---------------------------
        # Set up the producer (CTD Simulator)
        #---------------------------

        # Create the stream
        raw_ctd_stream_id = pubsub_management_service.create_stream(
            stream_definition_id=raw_ctd_stream_def_id)

        # Set up the datasets
        raw_ctd_dataset_id = dataset_management_service.create_dataset(
            stream_id=raw_ctd_stream_id,
            datastore_name=datastore_name,
            view_name='datasets/stream_join_granule')

        # Configure ingestion of this dataset
        raw_ctd_dataset_config_id = ingestion_management_service.create_dataset_configuration(
            dataset_id=raw_ctd_dataset_id,
            archive_data=True,
            archive_metadata=True,
            ingestion_configuration_id=
            ingestion_configuration_id,  # you need to know the ingestion configuration id!
        )
        # Hold onto ctd_dataset_config_id if you want to stop/start ingestion of that dataset by the ingestion service

        # Start the ctd simulator to produce some data
        configuration = {
            'process': {
                'stream_id': raw_ctd_stream_id,
            }
        }
        raw_sim_pid = process_dispatcher.schedule_process(
            process_definition_id=raw_ctd_sim_procdef_id,
            configuration=configuration)

        ###
        ### Make a subscriber in the test to listen for salinity data
        ###
        raw_subscription_id = pubsub_management_service.create_subscription(
            query=StreamQuery([
                raw_ctd_stream_id,
            ]),
            exchange_name='raw_test',
            name="test raw subscription",
        )

        # this is okay - even in cei mode!
        pid = cc.spawn_process(name='dummy_process_for_test',
                               module='pyon.ion.process',
                               cls='SimpleProcess',
                               config={})
        dummy_process = cc.proc_manager.procs[pid]

        subscriber_registrar = StreamSubscriberRegistrar(process=dummy_process,
                                                         node=cc.node)

        result = gevent.event.AsyncResult()
        results = []

        def message_received(message, headers):
            # Heads
            log.warn('Raw data received!')
            results.append(message)
            if len(results) > 3:
                result.set(True)

        subscriber = subscriber_registrar.create_subscriber(
            exchange_name='raw_test', callback=message_received)
        subscriber.start()

        # after the queue has been created it is safe to activate the subscription
        pubsub_management_service.activate_subscription(
            subscription_id=raw_subscription_id)

        # Assert that we have received data
        assertions(result.get(timeout=10))

        # stop the flow and parse the messages...
        process_dispatcher.cancel_process(
            raw_sim_pid
        )  # kill the ctd simulator process - that is enough data

        gevent.sleep(1)

        for message in results:

            sha1 = message.identifiables['stream_encoding'].sha1

            data = message.identifiables['data_stream'].values

            filename = FileSystem.get_hierarchical_url(FS.CACHE, sha1, ".raw")

            with open(filename, 'r') as f:

                assertions(data == f.read())
Example #15
    def _merge(self, msgs):
        '''
        @brief Merges all the granules and datasets into one large dataset (Union)
        @param msgs raw granules from couch
        @return complete dataset
        @description
             n
        D := U [ msgs_i ]
            i=0
        '''
        granule = None
        file_list = list()
        count = len(msgs)
        used_vals = list()

        #-------------------------------------------------------------------------------------
        # Merge each granule to another granule one by one.
        # After each merge operation keep track of what files belong where on the timeline
        #-------------------------------------------------------------------------------------


        for i in xrange(count):
            if i == 0:
                granule = msgs[0]['granule']
                psc = PointSupplementConstructor(point_definition=self.definition)

                res = ReplayProcess.merge_granule(definition=self.definition, granule1=granule, granule2=None)
                granule = res['granule']
                file_pair = res['files']
                log.debug('file_pair: %s', file_pair)

                if file_pair[0] not in file_list and file_pair[0][0] not in used_vals:
                    file_list.append(tuple(file_pair[0]))
                    used_vals.append(file_pair[0][0])


            else:
                res = ReplayProcess.merge_granule(definition=self.definition, granule1=granule, granule2=msgs[i]['granule'])

                granule = res['granule']
                file_pair = res['files']
                log.debug('file_pair: %s', file_pair)

                if file_pair[0] not in file_list and file_pair[0][0] not in used_vals:
                    file_list.append(tuple(file_pair[0]))
                    used_vals.append(file_pair[0][0])
                if file_pair[1] not in file_list and file_pair[1][0] not in used_vals:
                    file_list.append(tuple(file_pair[1]))
                    used_vals.append(file_pair[1][0])

        if not granule:
            return
        log.debug('file_list: %s', file_list)
        #-------------------------------------------------------------------------------------
        # Order the list using Python's stable sort (by the first value in the tuples),
        # then peel off just the file names,
        # then get the appropriate URL for each file using FileSystem
        #-------------------------------------------------------------------------------------
        file_list.sort()
        file_list = list(i[1] for i in file_list)
        file_list = list([FileSystem.get_hierarchical_url(FS.CACHE, '%s' % i) for i in file_list])

        pairs = self._pair_up(granule)
        var_names = list([i[0] for i in pairs])

        record_count = granule.identifiables[self.element_count_id].value
        codec = HDFEncoder()
        log.debug('acquire_data:')
        log.debug('\tfile_list: %s', file_list)
        log.debug('\tfields: %s', var_names)
        log.debug('\trecords: %s', record_count)

        data = acquire_data(file_list, var_names, record_count).next()

        for row, value in data.iteritems():
            value_path = self._find_vp(pairs, row)
            codec.add_hdf_dataset(value_path, nparray=value['values'])
            #-------------------------------------------------------------------------------------
            # Debugging
            #-------------------------------------------------------------------------------------
            log.debug('row: %s', row)
            log.debug('value path: %s', value_path)
            log.debug('value: %s', value['values'])

        hdf_string = codec.encoder_close()
        self._patch_granule(granule, hdf_string)
        return granule