    def testPush(self):
        """Test the push ingest by creating a staging dir, populating it,
        and then forcing the ingester to run.
        """
        staging = tempfile.mkdtemp()
        self.todelete.append(staging)

        # Create an empty file, named with the current epoch time, to ingest
        f_name = str(int(time.time()))
        with open(os.path.join(staging, f_name), "a"):
            pass

        # Check there is only 1 file here
        self.assertEqual(1, len(os.listdir(staging)))

        dataset = Dataset()
        dataset.id = 1
        dataset.data_source = PushDataSource(path=staging,
                                             pattern='^(?P<timestamp>[0-9]+)$')
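        # The named "timestamp" group should let the data source pick the
        # observation time straight out of the file name created above.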

        self.ingester.enqueue_ingress(dataset)
        self.ingester.process_ingress_queue(True)

        self.assertEqual(1, self.ingester._archive_queue.qsize())

        # Check the staging area has been emptied
        self.assertEqual(0, len(os.listdir(staging)))

    def setup_datasets(self):
        """Register a single mock dataset using a PeriodicSampling of 10000
        (presumably milliseconds)."""
        ds = Dataset()
        ds.data_source = _DataSource()
        ds.id = 1
        ds.data_source.__xmlrpc_class__ = "mock_data_source"
        ds.data_source.sampling = PeriodicSampling(10000)
        self.datasets.append(ds)

    def testPostProcessScript(self):
        """This test performs a complex data ingest, where the main data goes into dataset 1 and 
        the extracted data goes into dataset 2"""
        script = """import os
import datetime

from jcudc24ingesterapi.models.data_entry import DataEntry, FileObject

def process(cwd, data_entry):
    data_entry = data_entry[0]
    ret = []
    with open(os.path.join(cwd, data_entry["file1"].f_path)) as f:
        for l in f.readlines():
            l = l.strip().split(",")
            if len(l) != 2: continue
            new_data_entry = DataEntry(timestamp=datetime.datetime.now())
            new_data_entry["a"] = FileObject(f_path=l[1].strip())
            ret.append( new_data_entry )
    return ret
"""
        # Capture the ingests
        data_entries = DataEntryListener()
        self.ingester.service.register_observation_listener(data_entries)

        dataset = Dataset(dataset_id=1)
        dataset.data_source = _DataSource(processing_script=script)
        dataset.data_source.__xmlrpc_class__ = "csv1"

        self.ingester.enqueue_ingress(dataset)
        self.ingester.process_ingress_queue(True)
        self.assertEqual(0, self.ingester._ingress_queue.qsize())
        self.assertEqual(1, self.ingester._archive_queue.qsize())
        self.ingester.process_archive_queue(True)
        self.assertEqual(0, self.ingester._archive_queue.qsize())
        self.assertEqual(2, data_entries.count())

    def testBasicIngest(self):
        """This test performs a simple data ingest"""
        # Capture the ingests
        data_entries = DataEntryListener()
        self.ingester.service.register_observation_listener(data_entries)
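        # The ingester pipeline is two-staged: datasets are queued for
        # ingress, ingress processing feeds the archive queue, and archive
        # processing notifies the registered observation listeners.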

        dataset = Dataset()
        dataset.id = 1
        datasource = _DataSource()
        datasource.__xmlrpc_class__ = "csv1"

        dataset.data_source = datasource

        self.ingester.enqueue_ingress(dataset)
        self.assertEqual(1, self.ingester._ingress_queue.qsize())

        self.ingester.process_ingress_queue(True)

        self.assertEqual(1, self.ingester._archive_queue.qsize())

        self.ingester.process_archive_queue(True)
        self.assertEqual(1, data_entries.count())
    def test_dataset_data_source_unit(self):
        """This test creates a simple schema hierarchy, and tests updates, etc"""
        unit = UnitOfWork(None)

        schema1 = DataEntrySchema("base1")
        schema1.addAttr(FileDataType("file"))
        schema_id = unit.post(schema1)

        loc = Location(10.0, 11.0)
        loc.name = "Location"
        loc_id = unit.post(loc)

        dataset1 = Dataset()
        dataset1.schema = schema_id
        dataset1.location = loc_id
        dataset1_id = unit.post(dataset1)

        dataset2 = Dataset()
        dataset2.schema = schema_id
        dataset2.location = loc_id
        dataset2.data_source = DatasetDataSource(dataset1_id, "")
        dataset2_id = unit.post(dataset2)

        ret = self.service.commit(unit, None)
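        # commit() persists the unit of work; the returned objects carry the
        # provisional ids in "correlationid" so they can be matched back to
        # the server-assigned ids.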

        found = False
        for r in ret:
            if isinstance(r, Dataset) and dataset1_id == r.correlationid:
                dataset1_id = r.id
            elif isinstance(r, Dataset) and dataset2_id == r.correlationid:
                self.assertEqual(dataset1_id, r.data_source.dataset_id,
                                 "Data source dataset_id was not updated")
                found = True

        self.assertTrue(
            found, "Didn't find the dataset with the dataset data source")
    def test_api_usage(self):
        #       User data that is created by filling out the provisioning interface workflow steps.
        #   General
        title = "Test project"
        data_manager = "A Person"
        project_lead = "Another Person"

        #   Metadata
        project_region = Region("Test Region",
                                ((1, 1), (2, 2), (2, 1), (1, 1)))

        #   Methods & Datasets
        loc1 = Location(11.0, 11.0, "Test Site", 100)
        loc2 = Location(11.0, 11.0, "Test Site", 100)
        loc3 = Location(12.0, 11.0, "Test Site", 100)

        temp_work = self.ingester_platform.createUnitOfWork()
        temperature_schema = DataEntrySchema("Test Temp Schema")
        temperature_schema.addAttr(Double("temperature"))
        temp_work.post(temperature_schema)
        temp_work.commit()
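        # Committing assigns the schema its server id, which the derived
        # schemas below reference through "extends".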

        air_temperature_schema = DataEntrySchema("Air Temp Schema")
        air_temperature_schema.extends = [temperature_schema.id]
        air_temperature_schema = self.ingester_platform.post(
            air_temperature_schema)

        second_level_inheritance_schema = DataEntrySchema("Second Inheritance")
        second_level_inheritance_schema.extends = [air_temperature_schema.id]
        second_level_inheritance_schema = self.ingester_platform.post(
            second_level_inheritance_schema)
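        # "extends" presumably chains attribute inheritance, so entries
        # against the derived schemas can still carry the base "temperature"
        # attribute.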

        # Check the name is set
        temperature_schema_1 = self.ingester_platform.getSchema(
            temperature_schema.id)
        self.assertIsNotNone(temperature_schema.name)
        self.assertEqual(temperature_schema.name, temperature_schema_1.name)

        file_schema = DataEntrySchema()
        file_schema.addAttr(FileDataType("file"))
        file_schema = self.ingester_platform.post(file_schema)

        dataset1 = Dataset(location=None, schema=temperature_schema.id)
        dataset2 = Dataset(
            location=None,
            schema=file_schema.id,
            data_source=PullDataSource(
                "http://test.com",
                "file_handle",
                processing_script=
                "file://d:/processing_scripts/awsome_processing.py"))

        #        dataset3 = Dataset(None, file_schema, PullDataSource("http://test.com", "file_handle"), CustomSampling("file://d:/sampling_scripts/awsome_sampling.py"), "file://d:/processing_scripts/awsome_processing.py")

        self.cleanup_files.append(dataset2.data_source.processing_script)
        #        self.cleanup_files.push(dataset3.sampling.script)
        #        self.cleanup_files.push(dataset3.processing_script)

        #       Provisioning admin accepts the submitted project
        work = self.ingester_platform.createUnitOfWork()

        work.post(project_region)  # Save the region

        loc1.region = project_region.id  # Set the datasets location to use the projects region
        work.post(loc1)  # Save the location
        dataset1.location = loc1.id  # Set the datasets location
        work.post(dataset1)  # Save the dataset

        loc2.region = project_region.id
        work.post(loc2)
        dataset2.location = loc2.id
        work.post(dataset2)

        work.commit()
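        # Within the unit of work, post() assigns provisional ids (used for
        # the cross-references above); commit() persists everything and
        # resolves them to real ids.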

        # Region, location and dataset id's will be saved to the project within the provisioning system in some way

        #       User searches for datasets

        # TODO: Nigel? - Define searching api
        found_dataset_id = dataset1.id  # The dataset that has an extended file schema

        #       User manually enters data
        timestamp = datetime.datetime.now()
        data_entry_1 = DataEntry(found_dataset_id, timestamp)
        data_entry_1['temperature'] = 27.8  # Add the extended schema items
        data_entry_1 = self.ingester_platform.post(data_entry_1)
        self.assertIsNotNone(data_entry_1.id)

        timestamp2 = timestamp + datetime.timedelta(seconds=1)
        data_entry_2 = DataEntry(found_dataset_id, timestamp2)
        data_entry_2['temperature'] = 27.8  # Add the extended schema items
        data_entry_2 = self.ingester_platform.post(data_entry_2)
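
        # search(criteria, offset, limit) pages the results: "count" reports
        # the total number of matches while "results" holds at most "limit"
        # entries starting at "offset".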

        self.assertEqual(
            2,
            len(
                self.ingester_platform.search(
                    DataEntrySearchCriteria(found_dataset_id), 0, 10).results))
        result = self.ingester_platform.search(
            DataEntrySearchCriteria(found_dataset_id), 0, 1)
        self.assertEqual(2, result.count)
        self.assertEqual(1, len(result.results))
        self.assertEqual(
            1,
            len(
                self.ingester_platform.search(
                    DataEntrySearchCriteria(found_dataset_id), 1, 1).results))

        result = self.ingester_platform.search(
            DataEntrySearchCriteria(found_dataset_id), 2, 1)
        self.assertEqual(0, len(result.results))

        self.assertEqual(
            0,
            len(
                self.ingester_platform.search(
                    DataEntrySearchCriteria(found_dataset_id,
                                            end_time=timestamp -
                                            datetime.timedelta(seconds=60)), 0,
                    10).results))
        self.assertEqual(
            0,
            len(
                self.ingester_platform.search(
                    DataEntrySearchCriteria(found_dataset_id,
                                            start_time=timestamp +
                                            datetime.timedelta(seconds=60)), 0,
                    10).results))
        self.assertEqual(
            2,
            len(
                self.ingester_platform.search(
                    DataEntrySearchCriteria(
                        found_dataset_id,
                        start_time=timestamp - datetime.timedelta(seconds=60),
                        end_time=timestamp + datetime.timedelta(seconds=60)),
                    0, 10).results))

        work = self.ingester_platform.createUnitOfWork()
        data_entry_3 = DataEntry(dataset2.id, datetime.datetime.now())
        data_entry_3['file'] = FileObject(
            f_handle=open(os.path.join(
                os.path.dirname(jcudc24ingesterapi.__file__),
                "tests/test_ingest.xml")),
            mime_type="text/xml")
        work.post(data_entry_3)
        work.commit()
        self.assertIsNotNone(data_entry_3.id)

        # Stream the uploaded file back out and check it contains data
        f_in = self.ingester_platform.getDataEntryStream(
            dataset2.id, data_entry_3.id, "file")
        self.assertIsNotNone(f_in)
        data = f_in.read()
        f_in.close()
        self.assertLess(0, len(data), "Expected data in file")

        #       User enters quality assurance metadata
        quality_metadata_schema = DatasetMetadataSchema()
        quality_metadata_schema.addAttr(String("unit"))
        quality_metadata_schema.addAttr(String("description"))
        quality_metadata_schema.addAttr(Double("value"))
        quality_metadata_schema = self.ingester_platform.post(
            quality_metadata_schema)

        entered_metadata = DatasetMetadataEntry(data_entry_1.dataset,
                                                quality_metadata_schema.id)
        entered_metadata['unit'] = "%"
        entered_metadata['description'] = "Percent error"
        entered_metadata['value'] = 0.98

        entered_metadata = self.ingester_platform.post(entered_metadata)

        # Now find that metadata
        results = self.ingester_platform.search(
            DatasetMetadataSearchCriteria(data_entry_1.dataset), 0, 10).results
        self.assertEqual(1, len(results))

        data_entry_md_schema = DataEntryMetadataSchema("test")
        data_entry_md_schema.addAttr(String("description"))
        data_entry_md_schema.addAttr(Double("value"))
        data_entry_md_schema = self.ingester_platform.post(
            data_entry_md_schema)
        calibration = DataEntryMetadataEntry(metadata_schema_id=int(
            data_entry_md_schema.id),
                                             dataset_id=dataset2.id,
                                             object_id=data_entry_3.id)
        calibration["description"] = "Test"
        calibration["value"] = 1.2

        calibration2 = DataEntryMetadataEntry(metadata_schema_id=int(
            data_entry_md_schema.id),
                                              dataset_id=dataset2.id,
                                              object_id=data_entry_3.id)
        calibration2["description"] = "Test2"
        calibration2["value"] = 2.3
        calibration2 = self.ingester_platform.post(calibration2)

        # Search for the calibration metadata attached to data_entry_3 (only
        # calibration2 was posted, so exactly one result is expected)
        calibrations = self.ingester_platform.search(
            DataEntryMetadataSearchCriteria(dataset2.id, data_entry_3.id),
            offset=0,
            limit=1000)
        self.assertEqual(1, len(calibrations.results))
        self.assertEqual(calibrations.results[0].schema_id,
                         data_entry_md_schema.id)

        self.ingester_platform.delete(calibration2)
        self.ingester_platform.delete(calibration)
        self.ingester_platform.delete(data_entry_md_schema)

        #       User changes sampling rate
        # FIXME: This test is going to be changed to be done by editing the dataset
        #        sampling_rate_changed = Metadata(dataset1.id, type(dataset1), SampleRateMetadataSchema())
        #        sampling_rate_changed.change_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
        #        sampling_rate_changed.sampling = CustomSampling("file://d:/sampling_scripts/awsome_sampling.py")
        #
        #        try:
        #            sampling_rate_changed = self.ingester_platform.post(sampling_rate_changed)
        #            assert(sampling_rate_changed.metadata_id is None, "Sampling rate change failed")
        #        except:
        #            assert(True, "Sampling rate change failed")

        #       User wants some random metadata specific to their project
        # FIXME: Not sure what use case this is trying to demonstrate
        #        random_metadata_schema =  DataEntryMetadataSchema()
        #        random_metadata_schema.addAttr('random_field', Double())

        #        random_metadata = Metadata(data_entry.data_entry_id, type(data_entry), random_metadata_schema)
        #        random_metadata.random_field = 1.5

        #        try:
        #            random_metadata = self.ingester_platform.post(random_metadata)
        #            assert(random_metadata.metadata_id is None, "random_metadata failed")
        #        except:
        #            assert(True, "random_metadata failed")

        #       User changes the data source of the dataset
        new_data_source = PullDataSource("http://test.com/new_data",
                                         "file_handle")
        dataset1.data_source = new_data_source
        dataset1 = self.ingester_platform.post(dataset1)
        self.assertNotEqual(None, dataset1)

        #       External, 3rd party searches for data
        # TODO: external 3rd parties should be able to use the api to get data without authentication
        # TODO: I'm not sure exactly how this should work, but the search api could be open access (need spam limitations or something?)

        #       Project is disabled/finished
        work = self.ingester_platform.createUnitOfWork()
        work.disable(dataset1.id)
        work.disable(dataset2.id)
        work.commit()

        #       Project is obsolete and data should be deleted
        work = self.ingester_platform.createUnitOfWork()
        work.delete(dataset1.id)
        work.delete(dataset2.id)
        work.commit()