def _copy_attrs(self, attrs, dst):
    for attr in attrs:
        if "size" in attr:
            fo = FileObject()
            fo.f_name = attr["name"]
            fo.mime_type = attr["mimeType"]
            fo.file_name = attr["originalFileName"]
            dst.data[attr["name"]] = fo
        else:
            dst.data[attr["name"]] = attr["value"]
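Below is a minimal, self-contained sketch of the attribute convention assumed by _copy_attrs: a dict carrying a "size" key is treated as an uploaded file and wrapped in a FileObject, anything else is stored as a plain value. The stub classes and sample data are illustrative, not the real ingester API.

class FileObject(object):
    """Illustrative stand-in for the real FileObject."""
    pass

class Destination(object):
    """Illustrative stand-in for whatever receives the copied attributes."""
    def __init__(self):
        self.data = {}

attrs = [
    {"name": "temperature", "value": 27.8},                    # plain value
    {"name": "photo", "size": 2048, "mimeType": "image/jpeg",  # uploaded file
     "originalFileName": "site.jpg"},
]
dst = Destination()
for attr in attrs:
    if "size" in attr:   # presence of "size" marks a file-backed attribute
        fo = FileObject()
        fo.f_name = attr["name"]
        fo.mime_type = attr["mimeType"]
        fo.file_name = attr["originalFileName"]
        dst.data[attr["name"]] = fo
    else:
        dst.data[attr["name"]] = attr["value"]
print(sorted(dst.data))   # ['photo', 'temperature']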
    def persist_data_entry_metadata(self, data_entry, schema, attrs, cwd):
        # Check the attributes are actually in the schema
        self.validate_schema(attrs, schema.attrs)
        s = orm.sessionmaker(bind=self.engine)()
        try:
            md = DataEntryMetadata()
            md.data_entry = data_entry.id
            md.schema = schema.id

            s.add(md)
            s.flush()

            # Copy all files into place
            self.copy_files(attrs, schema.attrs, cwd, md,
                            "data_entry_metadata")

            merge_parameters(md.attrs, attrs, DataEntryMetadataAttr)
            s.merge(md)
            s.flush()
            s.commit()

            entry = DataEntryMetadataEntry(object_id=md.data_entry,
                                           metadata_schema_id=md.schema,
                                           id=md.id)
            for attr in md.attrs:
                if isinstance(schema.attrs[attr.name], FileDataType):
                    entry[attr.name] = FileObject(f_path=attr.value)
                else:
                    entry[attr.name] = attr.value
            return entry

        finally:
            s.close()
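validate_schema and merge_parameters are library helpers not shown here; a rough sketch of the kind of membership check validate_schema might perform, assuming attributes and schema attributes are both keyed by name, could look like this (illustrative only):

def validate_schema(attrs, schema_attrs):
    """Illustrative check: every submitted attribute must be defined in the schema."""
    unknown = [name for name in attrs if name not in schema_attrs]
    if unknown:
        raise ValueError("Attributes not in schema: %s" % ", ".join(unknown))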
Example no. 4
    def fetch_observations(self, sos, caps, cwd, ret):
        insert_dir = os.path.join(cwd, "observations")
        if not os.path.exists(insert_dir):
            os.makedirs(insert_dir)

        for observationID in caps.createRangeGenerator():
            if observationID not in self.state['observations']:
                logger.debug("GetObservationByID for %s" % observationID)
                sos_obs = sos.getObservationByID(observationID,
                                                 "om:Observation")
                obs_path = os.path.join(insert_dir, "%s.xml" % observationID)
                with open(obs_path, "wb") as output:
                    output.write(sos_obs.getXMLString())
                    timestamp = sos_obs.getTimestamp()
                    new_data_entry = DataEntry(timestamp=timestamp)
                    new_data_entry[self.field] = FileObject(
                        f_path=obs_path, mime_type=SOSMimeTypes.om_1_0_0)
                    ret.append(new_data_entry)
                self.state['observations'].append(observationID)
                self.state['observation_map'][sos_obs.getSensorID()].append(
                    observationID)
            else:
                logger.debug(
                    "GetObservationByID for %s already retrieved, ignoring." %
                    observationID)
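The ingest-once bookkeeping above (remember processed observation IDs in self.state so a later run only fetches new ones) reduces to a small pattern; the sketch below uses illustrative names only.

def fetch_new(all_ids, state):
    """Return only the IDs not seen before and record them in the state dict."""
    seen = state.setdefault("observations", [])
    new_ids = [obs_id for obs_id in all_ids if obs_id not in seen]
    seen.extend(new_ids)
    return new_ids

state = {}
print(fetch_new(["obs-1", "obs-2"], state))   # ['obs-1', 'obs-2']
print(fetch_new(["obs-1", "obs-3"], state))   # ['obs-3']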
Example no. 5
    def fetch_single(self, cwd):
        """Fetch a single resource from a URL"""
        req = urllib2.Request(self.url)
        f_out_name = os.path.join(cwd, "outputfile")
        f_in = None
        try:
            f_in = urllib2.urlopen(req)
            timestamp = parse_timestamp_rfc_2822(f_in.headers["Last-Modified"]) if "Last-Modified" in f_in.headers \
                else datetime.datetime.now()
            with open(f_out_name, "wb") as f_out:
                shutil.copyfileobj(f_in, f_out)

            self.state["lasttime"] = format_timestamp(timestamp)
        finally:
            if f_in is not None: f_in.close()
        new_data_entry = DataEntry(timestamp=timestamp)

        file_name = None
        try:
            file_name = self.url.split("/")[-1]
        except:
            pass

        new_data_entry[self.field] = FileObject(f_path="outputfile",
                                                mime_type="",
                                                file_name=file_name)

        return [new_data_entry]
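parse_timestamp_rfc_2822 is an ingester utility; a roughly equivalent fallback built only on the standard library might look like the sketch below (an assumption, not the project's implementation).

import datetime
import email.utils

def timestamp_from_headers(headers):
    """Use the Last-Modified header when present, else fall back to the current time."""
    if "Last-Modified" in headers:
        parsed = email.utils.parsedate(headers["Last-Modified"])
        if parsed is not None:
            return datetime.datetime(*parsed[:6])
    return datetime.datetime.now()

print(timestamp_from_headers({"Last-Modified": "Sat, 29 Oct 1994 19:43:31 GMT"}))
print(timestamp_from_headers({}))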
    def fetch(self, cwd, service=None):
        with open(os.path.join(cwd, "file1"), "w") as f:
            f.write("2,55\n3,2\n")

        data_entry = DataEntry(timestamp=datetime.datetime.now())
        data_entry["file1"] = FileObject("file1")

        return [data_entry]
    def testScript(self):
        file1 = "1\n2\n"
        with open(os.path.join(self.cwd, "file1"), "w") as f:
            f.write(file1)
        data_entry = DataEntry(timestamp=datetime.datetime.now())
        data_entry["file1"] = FileObject("file1")

        script = """def process(cwd, data_entry):
    return [data_entry, None, None]
"""
        new_entries = run_script(script, self.cwd, data_entry)

        self.assertEquals(3, len(new_entries))
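run_script itself is not shown in this snippet; one plausible way such a helper could execute the user-supplied script and call its process() function is sketched below (illustrative, not the actual implementation).

def run_script(script, cwd, data_entry):
    """Compile the script text, look up its process() function and call it."""
    namespace = {}
    exec(compile(script, "<processing script>", "exec"), namespace)
    return namespace["process"](cwd, data_entry)

script = """def process(cwd, data_entry):
    return [data_entry, None, None]
"""
print(len(run_script(script, "/tmp", object())))   # 3, as the test expects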
def _create_data_entry(self, obs, schema):
    """Internal method for creating the DataEntry domain object from a database
    observation
    """
    entry = DataEntry()
    entry.dataset = obs.dataset
    entry.id = obs.id
    entry.timestamp = obs.timestamp
    for attr in obs.attrs:
        if isinstance(schema.attrs[attr.name], FileDataType):
            entry[attr.name] = FileObject(f_path=attr.value)
        else:
            entry[attr.name] = attr.value
    return entry
    def test_file_object_roundtrip(self):
        """The file object should marshall everything but the file stream"""
        data_entry = DataEntry(1)
        data_entry["temp"] = FileObject(f_path=os.path.join(
            os.path.dirname(jcudc24ingesterapi.__file__),
            "tests/test_ingest.xml"),
                                        mime_type="text/xml")

        data_entry_dto = self.marshaller.obj_to_dict(data_entry)
        self.assertEqual("text/xml",
                         data_entry_dto["data"]["temp"]["mime_type"])

        data_entry_domain = self.marshaller.dict_to_obj(data_entry_dto)
        self.assertEqual("text/xml", data_entry_domain["temp"].mime_type)
    def _create_data_entry_metadata(self, session, obj):
        """Internal method for creating the DataEntry domain object from a database
        observation
        """
        schema = ConcreteSchema(self.service.get_schema_tree(obj.schema))

        entry = DataEntryMetadataEntry()
        entry.metadata_schema = obj.schema
        entry.id = obj.id
        entry.object_id = obj.data_entry
        for attr in obj.attrs:
            if isinstance(schema.attrs[attr.name], FileDataType):
                entry[attr.name] = FileObject(f_path=attr.value)
            else:
                entry[attr.name] = attr.value
        return entry
Example no. 11
    def fetch(self, cwd, service=None):
        """Scans a folder to find new files. The filenames are UTC timestamps that used
        as the timestamp for these samples.
        
        :param cwd: working directory to place binary data
        :returns: dict containing the data to be ingested
        """
        if not hasattr(self, "path"):
            raise DataSourceError("Path not set")
        if not os.path.exists(self.path):
            raise DataSourceError("Could not find the staging path")

        start_time = datetime.datetime.utcnow()

        # When the file should have been modified since
        since = None
        if "lasttime" in self.state and self.state["lasttime"] != None and len(
                self.state["lasttime"]) > 0:
            since = calendar.timegm(
                parse_timestamp(self.state["lasttime"]).timetuple())

        ret = []
        for f_name in os.listdir(self.path):
            timestamp = self.match_filename(f_name)
            if timestamp is None: continue

            logger.debug("%s %s" % (str(timestamp), f_name))

            new_filename = "file-" + f_name
            if self.archive is not None:
                shutil.copyfile(os.path.join(self.path, f_name),
                                os.path.join(self.archive, f_name))
            shutil.move(os.path.join(self.path, f_name),
                        os.path.join(cwd, new_filename))
            #timestamp = datetime.datetime.utcfromtimestamp(int(m.group(1)))
            new_data_entry = DataEntry(timestamp=timestamp)
            new_data_entry[self.field] = FileObject(f_path=new_filename,
                                                    file_name=f_name,
                                                    mime_type="")
            ret.append(new_data_entry)

        self.state["lasttime"] = format_timestamp(
            since) if since != None else None

        return ret
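match_filename is assumed (per the commented-out line above) to map a staged filename to a UTC timestamp; a plausible sketch for filenames that carry epoch seconds, e.g. "1363246000.jpg", is:

import datetime
import re

FILENAME_RE = re.compile(r"^(\d+)(?:\.[A-Za-z0-9]+)?$")

def match_filename(f_name):
    """Return a UTC datetime parsed from an epoch-seconds filename, or None."""
    m = FILENAME_RE.match(f_name)
    if m is None:
        return None
    return datetime.datetime.utcfromtimestamp(int(m.group(1)))

print(match_filename("1363246000.jpg"))   # 2013-03-14 ... (UTC)
print(match_filename("README.txt"))       # None -> file is skipped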
Example no. 12
    def fetch_sensorml(self, sos, caps, cwd, ret):
        sensorIDS = caps.getSensorIDs()
        sensorml_dir = os.path.join(cwd, "sensorml")
        if not os.path.exists(sensorml_dir):
            os.makedirs(sensorml_dir)

        for sensorID in sensorIDS:
            if sensorID not in self.state['sensorml']:
                logger.debug("Getting SensorML for %s" % sensorID)
                sml = sos.describeSensor(sensorID)
                sml_path = os.path.join(sensorml_dir, sensorID)
                with open(sml_path, "wb") as sensorml:
                    sensorml.write(sml.getXMLString())
                    timestamp = datetime.datetime.now()
                    new_data_entry = DataEntry(timestamp=timestamp)
                    new_data_entry[self.field] = FileObject(
                        f_path=sml_path, mime_type=SOSMimeTypes.sensorML_1_0_1)
                    ret.append(new_data_entry)
                self.state['sensorml'].append(sensorID)
            else:
                logger.debug("SensorML for %s already exists, ignoring." %
                             sensorID)
Example no. 13
    def test_api_usage(self):
        #       User data that is created by filling out the provisioning interface workflow steps.
        #   General
        title = "Test project"
        data_manager = "A Person"
        project_lead = "Another Person"

        #   Metadata
        project_region = Region("Test Region",
                                ((1, 1), (2, 2), (2, 1), (1, 1)))

        #   Methods & Datasets
        loc1 = Location(11.0, 11.0, "Test Site", 100)
        loc2 = Location(11.0, 11.0, "Test Site", 100)
        loc3 = Location(12.0, 11.0, "Test Site", 100)

        temp_work = self.ingester_platform.createUnitOfWork()
        temperature_schema = DataEntrySchema("Test Temp Schema")
        temperature_schema.addAttr(Double("temperature"))
        temp_work.post(temperature_schema)
        temp_work.commit()

        air_temperature_schema = DataEntrySchema("Air Temp Schema")
        air_temperature_schema.extends = [temperature_schema.id]
        air_temperature_schema = self.ingester_platform.post(
            air_temperature_schema)

        second_level_inheritence_schema = DataEntrySchema("Second Inheritence")
        second_level_inheritence_schema.extends = [air_temperature_schema.id]
        second_level_inheritence_schema = self.ingester_platform.post(
            second_level_inheritence_schema)

        # Check the name is set
        temperature_schema_1 = self.ingester_platform.getSchema(
            temperature_schema.id)
        self.assertIsNotNone(temperature_schema.name)
        self.assertEquals(temperature_schema.name, temperature_schema_1.name)

        file_schema = DataEntrySchema()
        file_schema.addAttr(FileDataType("file"))
        file_schema = self.ingester_platform.post(file_schema)

        dataset1 = Dataset(location=None, schema=temperature_schema.id)
        dataset2 = Dataset(
            location=None,
            schema=file_schema.id,
            data_source=PullDataSource(
                "http://test.com",
                "file_handle",
                processing_script=
                "file://d:/processing_scripts/awsome_processing.py"))

        #        dataset3 = Dataset(None, file_schema, PullDataSource("http://test.com", "file_handle"), CustomSampling("file://d:/sampling_scripts/awsome_sampling.py"), "file://d:/processing_scripts/awsome_processing.py")

        self.cleanup_files.append(dataset2.data_source.processing_script)
        #        self.cleanup_files.push(dataset3.sampling.script)
        #        self.cleanup_files.push(dataset3.processing_script)

        #       Provisioning admin accepts the submitted project
        work = self.ingester_platform.createUnitOfWork()

        work.post(project_region)  # Save the region

        loc1.region = project_region.id  # Set the datasets location to use the projects region
        work.post(loc1)  # Save the location
        dataset1.location = loc1.id  # Set the datasets location
        work.post(dataset1)  # Save the dataset

        loc2.region = project_region.id
        work.post(loc2)
        dataset2.location = loc2.id
        work.post(dataset2)

        work.commit()

        # Region, location and dataset id's will be saved to the project within the provisioning system in some way

        #       User searches for datasets

        # TODO: Nigel? - Define searching api
        found_dataset_id = dataset1.id  # The dataset that has an extended file schema

        #       User manually enters data
        timestamp = datetime.datetime.now()
        data_entry_1 = DataEntry(found_dataset_id, timestamp)
        data_entry_1['temperature'] = 27.8  # Add the extended schema items
        data_entry_1 = self.ingester_platform.post(data_entry_1)
        self.assertIsNotNone(data_entry_1.id)

        timestamp2 = timestamp + datetime.timedelta(seconds=1)
        data_entry_2 = DataEntry(found_dataset_id, timestamp2)
        data_entry_2['temperature'] = 27.8  # Add the extended schema items
        data_entry_2 = self.ingester_platform.post(data_entry_2)

        self.assertEquals(
            2,
            len(
                self.ingester_platform.search(
                    DataEntrySearchCriteria(found_dataset_id), 0, 10).results))
        result = self.ingester_platform.search(
            DataEntrySearchCriteria(found_dataset_id), 0, 1)
        self.assertEquals(2, result.count)
        self.assertEquals(1, len(result.results))
        self.assertEquals(
            1,
            len(
                self.ingester_platform.search(
                    DataEntrySearchCriteria(found_dataset_id), 1, 1).results))

        result = self.ingester_platform.search(
            DataEntrySearchCriteria(found_dataset_id), 2, 1)
        self.assertEquals(0, len(result.results))

        self.assertEquals(
            0,
            len(
                self.ingester_platform.search(
                    DataEntrySearchCriteria(found_dataset_id,
                                            end_time=timestamp -
                                            datetime.timedelta(seconds=60)), 0,
                    10).results))
        self.assertEquals(
            0,
            len(
                self.ingester_platform.search(
                    DataEntrySearchCriteria(found_dataset_id,
                                            start_time=timestamp +
                                            datetime.timedelta(seconds=60)), 0,
                    10).results))
        self.assertEquals(
            2,
            len(
                self.ingester_platform.search(
                    DataEntrySearchCriteria(
                        found_dataset_id,
                        start_time=timestamp - datetime.timedelta(seconds=60),
                        end_time=timestamp + datetime.timedelta(seconds=60)),
                    0, 10).results))

        work = self.ingester_platform.createUnitOfWork()
        data_entry_3 = DataEntry(dataset2.id, datetime.datetime.now())
        data_entry_3['file'] = FileObject(f_handle=open(
            os.path.join(os.path.dirname(jcudc24ingesterapi.__file__),
                         "tests/test_ingest.xml")),
                                          mime_type="text/xml")
        work.post(data_entry_3)
        work.commit()
        self.assertIsNotNone(data_entry_3.id)

        f_in = self.ingester_platform.getDataEntryStream(
            dataset2.id, data_entry_3.id, "file")
        self.assertIsNotNone(f_in)
        data = f_in.read()
        f_in.close()
        self.assertLess(0, len(data), "Expected data in file")

        #       User enters quality assurance metadata
        quality_metadata_schema = DatasetMetadataSchema()
        quality_metadata_schema.addAttr(String("unit"))
        quality_metadata_schema.addAttr(String("description"))
        quality_metadata_schema.addAttr(Double("value"))
        quality_metadata_schema = self.ingester_platform.post(
            quality_metadata_schema)

        entered_metadata = DatasetMetadataEntry(data_entry_1.dataset,
                                                quality_metadata_schema.id)
        entered_metadata['unit'] = "%"
        entered_metadata['description'] = "Percent error"
        entered_metadata['value'] = 0.98

        entered_metadata = self.ingester_platform.post(entered_metadata)

        # Now find that metadata
        results = self.ingester_platform.search(
            DatasetMetadataSearchCriteria(data_entry_1.dataset), 0, 10).results
        self.assertEqual(1, len(results))

        data_entry_md_schema = DataEntryMetadataSchema("test")
        data_entry_md_schema.addAttr(String("description"))
        data_entry_md_schema.addAttr(Double("value"))
        data_entry_md_schema = self.ingester_platform.post(
            data_entry_md_schema)
        calibration = DataEntryMetadataEntry(metadata_schema_id=int(
            data_entry_md_schema.id),
                                             dataset_id=dataset2.id,
                                             object_id=data_entry_3.id)
        calibration["description"] = "Test"
        calibration["value"] = 1.2

        calibration2 = DataEntryMetadataEntry(metadata_schema_id=int(
            data_entry_md_schema.id),
                                              dataset_id=dataset2.id,
                                              object_id=data_entry_3.id)
        calibration2["description"] = "Test2"
        calibration2["value"] = 2.3
        calibration2 = self.ingester_platform.post(calibration2)

        calibrations = self.ingester_platform.search(
            DataEntryMetadataSearchCriteria(int(dataset2.id),
                                            int(data_entry_3.id)),
            offset=0,
            limit=1000)
        self.assertEquals(1, len(calibrations.results))
        self.assertEquals(calibrations.results[0].schema_id,
                          data_entry_md_schema.id)

        self.ingester_platform.delete(calibration2)
        self.ingester_platform.delete(calibration)
        self.ingester_platform.delete(data_entry_md_schema)

        #       User changes sampling rate
        # FIXME: This test is going to be changed to be done by editing the dataset
        #        sampling_rate_changed = Metadata(dataset1.id, type(dataset1), SampleRateMetadataSchema())
        #        sampling_rate_changed.change_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
        #        sampling_rate_changed.sampling = CustomSampling("file://d:/sampling_scripts/awsome_sampling.py")
        #
        #        try:
        #            sampling_rate_changed = self.ingester_platform.post(sampling_rate_changed)
        #            assert(sampling_rate_changed.metadata_id is None, "Sampling rate change failed")
        #        except:
        #            assert(True, "Sampling rate change failed")

        #       User wants some random metadata specific to their project
        # FIXME: Not sure what use case this is trying to demonstrate
        #        random_metadata_schema =  DataEntryMetadataSchema()
        #        random_metadata_schema.addAttr('random_field', Double())

        #        random_metadata = Metadata(data_entry.data_entry_id, type(data_entry), random_metadata_schema)
        #        random_metadata.random_field = 1.5

        #        try:
        #            random_metadata = self.ingester_platform.post(random_metadata)
        #            assert(random_metadata.metadata_id is None, "random_metadata failed")
        #        except:
        #            assert(True, "random_metadata failed")

        #       User changes the data source of the dataset
        new_data_source = PullDataSource("http://test.com/new_data",
                                         "file_handle")
        dataset1.data_source = new_data_source
        dataset1 = self.ingester_platform.post(dataset1)
        self.assertNotEqual(None, dataset1)

        #       External, 3rd party searches for data
        # TODO: external 3rd parties should be able to use the api to get data without authentication
        # TODO: I'm not sure exactly how this should work, but the search api could be open access (need spam limitations or something?)

        #       Project is disabled/finished
        work = self.ingester_platform.createUnitOfWork()
        work.disable(dataset1.id)
        work.disable(dataset2.id)
        work.commit()

        #       Project is obsolete and data should be deleted
        work = self.ingester_platform.createUnitOfWork()
        work.delete(dataset1.id)
        work.delete(dataset2.id)
        work.commit()
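The search assertions in this test follow ordinary offset/limit pagination over the two entries posted to the dataset; the expected result sizes are simple slice arithmetic, as sketched below.

entries = ["data_entry_1", "data_entry_2"]   # two entries posted above

def page(results, offset, limit):
    """Offset/limit paging over an already-sorted result list."""
    return results[offset:offset + limit]

print(len(page(entries, 0, 10)))   # 2 -> both entries fit on one page
print(len(page(entries, 0, 1)))    # 1 -> first page of size one (count stays 2)
print(len(page(entries, 1, 1)))    # 1 -> second page of size one
print(len(page(entries, 2, 1)))    # 0 -> offset past the end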
Example no. 14
    def fetch_http(self, cwd):
        """Recursively fetch from an HTTP server.
        """
        RE_A = re.compile(r"href=\"(\./){0,1}([0-9A-Za-z\-_\.\:]+)\"")
        req = urllib2.Request(self.url)
        ret = []

        # "since" is the RFC 2822 string sent as If-Modified-Since; "latest" tracks
        # the newest Last-Modified timestamp seen and is persisted back to state.
        since = None
        latest = None
        if "lasttime" in self.state and self.state["lasttime"] is not None and len(
                self.state["lasttime"]) > 0:
            latest = parse_timestamp(self.state["lasttime"])
            since = eut.formatdate(calendar.timegm(latest.timetuple()),
                                   usegmt=True)

        f_in = None
        try:
            f_index = urllib2.urlopen(req)
            index_page = f_index.read()
            f_index.close()
            urls = RE_A.findall(index_page)
            found = 0

            RE_FILENAME = None if self.pattern is None else re.compile(
                self.pattern)
            for url_part in urls:
                if RE_FILENAME is not None and RE_FILENAME.match(
                        url_part[1]) is None:
                    continue

                url = urlparse.urljoin(self.url, url_part[0] + url_part[1])
                req = urllib2.Request(url)
                if since is not None:
                    req.add_header("If-Modified-Since", since)
                try:
                    f_in = urllib2.urlopen(req)
                    f_out_name = os.path.join(cwd, "outputfile%d" % found)
                    timestamp = parse_timestamp_rfc_2822(
                        f_in.headers["Last-Modified"])
                    with open(f_out_name, "wb") as f_out:
                        shutil.copyfileobj(f_in, f_out)
                    new_data_entry = DataEntry(timestamp=timestamp)
                    file_name = url_part[1].split("/")[-1]
                    new_data_entry[self.field] = FileObject(
                        f_path="outputfile%d" % found,
                        mime_type="",
                        file_name=file_name)
                    ret.append(new_data_entry)
                    found += 1

                    if latest is None or timestamp > latest:
                        latest = timestamp

                except urllib2.HTTPError as e:
                    # 304 means the file is unchanged since the last fetch
                    if e.code == 304:
                        continue
        finally:
            if f_in is not None:
                f_in.close()

        self.state["lasttime"] = format_timestamp(
            latest) if latest is not None else None
        return ret
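fetch_http uses the Python 2 urllib2 API; the same conditional-GET / 304 Not Modified pattern written against the Python 3 standard library would look roughly like this sketch.

import urllib.error
import urllib.request

def fetch_if_modified(url, since=None):
    """Return the response body, or None if the server answers 304 Not Modified."""
    req = urllib.request.Request(url)
    if since is not None:
        req.add_header("If-Modified-Since", since)   # RFC 2822 date string
    try:
        with urllib.request.urlopen(req) as resp:
            return resp.read()
    except urllib.error.HTTPError as e:
        if e.code == 304:
            return None          # unchanged since the last fetch
        raise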