def _copy_attrs(self, attrs, dst):
    for attr in attrs:
        if "size" in attr:
            fo = FileObject()
            fo.f_name = attr["name"]
            fo.mime_type = attr["mimeType"]
            fo.file_name = attr["originalFileName"]
            dst.data[attr["name"]] = fo
        else:
            dst.data[attr["name"]] = attr["value"]
def persist_data_entry_metadata(self, data_entry, schema, attrs, cwd):
    # Check the attributes are actually in the schema
    self.validate_schema(attrs, schema.attrs)

    s = orm.sessionmaker(bind=self.engine)()
    try:
        md = DataEntryMetadata()
        md.data_entry = data_entry.id
        md.schema = schema.id
        s.add(md)
        s.flush()

        # Copy all files into place
        self.copy_files(attrs, schema.attrs, cwd, md, "data_entry_metadata")

        merge_parameters(md.attrs, attrs, DataEntryMetadataAttr)
        s.merge(md)
        s.flush()
        s.commit()

        entry = DataEntryMetadataEntry(object_id=md.data_entry,
                                       metadata_schema_id=md.schema,
                                       id=md.id)
        for attr in md.attrs:
            if isinstance(schema.attrs[attr.name], FileDataType):
                entry[attr.name] = FileObject(f_path=attr.value)
            else:
                entry[attr.name] = attr.value
        return entry
    finally:
        s.close()
def fetch_observations(self, sos, caps, cwd, ret):
    insert_dir = os.path.join(cwd, "observations")
    if not os.path.exists(insert_dir):
        os.makedirs(insert_dir)

    for observationID in caps.createRangeGenerator():
        if observationID not in self.state['observations']:
            logger.debug("GetObservationByID for %s" % observationID)
            sos_obs = sos.getObservationByID(observationID, "om:Observation")
            obs_path = os.path.join(insert_dir, "%s.xml" % observationID)
            with open(obs_path, "wb") as output:
                output.write(sos_obs.getXMLString())

            timestamp = sos_obs.getTimestamp()
            new_data_entry = DataEntry(timestamp=timestamp)
            new_data_entry[self.field] = FileObject(
                f_path=obs_path, mime_type=SOSMimeTypes.om_1_0_0)
            ret.append(new_data_entry)

            self.state['observations'].append(observationID)
            self.state['observation_map'][sos_obs.getSensorID()].append(
                observationID)
        else:
            logger.debug(
                "GetObservationByID for %s already retrieved, ignoring." %
                observationID)
def fetch_single(self, cwd):
    """Fetch a single resource from a URL"""
    req = urllib2.Request(self.url)
    f_out_name = os.path.join(cwd, "outputfile")
    f_in = None
    try:
        f_in = urllib2.urlopen(req)
        timestamp = parse_timestamp_rfc_2822(f_in.headers["Last-Modified"]) \
            if "Last-Modified" in f_in.headers else datetime.datetime.now()
        with open(f_out_name, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
        self.state["lasttime"] = format_timestamp(timestamp)
    finally:
        if f_in is not None:
            f_in.close()

    new_data_entry = DataEntry(timestamp=timestamp)
    file_name = None
    try:
        file_name = self.url.split("/")[-1]
    except Exception:
        pass
    # The f_path is relative to the working directory that was passed in.
    new_data_entry[self.field] = FileObject(f_path="outputfile",
                                            mime_type="",
                                            file_name=file_name)
    return [new_data_entry]
def fetch(self, cwd, service=None):
    with open(os.path.join(cwd, "file1"), "w") as f:
        f.write("2,55\n3,2\n")
    data_entry = DataEntry(timestamp=datetime.datetime.now())
    data_entry["file1"] = FileObject("file1")
    return [data_entry]
def testScript(self):
    file1 = "1\n2\n"
    with open(os.path.join(self.cwd, "file1"), "w") as f:
        f.write(file1)
    data_entry = DataEntry(timestamp=datetime.datetime.now())
    data_entry["file1"] = FileObject("file1")

    script = """def process(cwd, data_entry):
    return [data_entry, None, None]
"""
    new_entries = run_script(script, self.cwd, data_entry)
    self.assertEquals(3, len(new_entries))
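# A slightly richer processing-script sketch for run_script, assuming the same
# process(cwd, data_entry) contract exercised in testScript above: the script
# receives the working directory and the staged entry and returns the list of
# entries to ingest. The blank-line normalisation step is illustrative only and
# not part of the original test suite.
script = """def process(cwd, data_entry):
    import os
    # Rewrite the staged file in place (strip blank lines) before ingest.
    path = os.path.join(cwd, "file1")
    with open(path) as f:
        lines = [l for l in f if l.strip()]
    with open(path, "w") as f:
        f.writelines(lines)
    return [data_entry]
"""
new_entries = run_script(script, cwd, data_entry)  # cwd/data_entry as in testScript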
def _create_data_entry(self, obs, schema):
    """Internal method for creating the DataEntry domain object
    from a database observation
    """
    entry = DataEntry()
    entry.dataset = obs.dataset
    entry.id = obs.id
    entry.timestamp = obs.timestamp
    for attr in obs.attrs:
        if isinstance(schema.attrs[attr.name], FileDataType):
            entry[attr.name] = FileObject(f_path=attr.value)
        else:
            entry[attr.name] = attr.value
    return entry
def test_file_object_roundtrip(self):
    """The file object should marshall everything but the file stream"""
    data_entry = DataEntry(1)
    data_entry["temp"] = FileObject(
        f_path=os.path.join(os.path.dirname(jcudc24ingesterapi.__file__),
                            "tests/test_ingest.xml"),
        mime_type="text/xml")
    data_entry_dto = self.marshaller.obj_to_dict(data_entry)
    self.assertEqual("text/xml", data_entry_dto["data"]["temp"]["mime_type"])
    data_entry_domain = self.marshaller.dict_to_obj(data_entry_dto)
    self.assertEqual("text/xml", data_entry_domain["temp"].mime_type)
def _create_data_entry_metadata(self, session, obj):
    """Internal method for creating the DataEntryMetadataEntry domain object
    from a database metadata record
    """
    schema = ConcreteSchema(self.service.get_schema_tree(obj.schema))
    entry = DataEntryMetadataEntry()
    entry.metadata_schema = obj.schema
    entry.id = obj.id
    entry.object_id = obj.data_entry
    for attr in obj.attrs:
        if isinstance(schema.attrs[attr.name], FileDataType):
            entry[attr.name] = FileObject(f_path=attr.value)
        else:
            entry[attr.name] = attr.value
    return entry
def fetch(self, cwd, service=None):
    """Scans a folder to find new files. The filenames are UTC timestamps
    that are used as the timestamp for these samples.

    :param cwd: working directory to place binary data
    :returns: list of DataEntry objects to be ingested
    """
    if not hasattr(self, "path"):
        raise DataSourceError("Path not set")
    if not os.path.exists(self.path):
        raise DataSourceError("Could not find the staging path")

    start_time = datetime.datetime.utcnow()
    # Files should only be picked up if modified since this time
    since = None
    if "lasttime" in self.state and self.state["lasttime"] is not None \
            and len(self.state["lasttime"]) > 0:
        since = calendar.timegm(
            parse_timestamp(self.state["lasttime"]).timetuple())

    ret = []
    for f_name in os.listdir(self.path):
        timestamp = self.match_filename(f_name)
        if timestamp is None:
            continue
        logger.debug("%s %s" % (str(timestamp), f_name))

        new_filename = "file-" + f_name
        if self.archive is not None:
            shutil.copyfile(os.path.join(self.path, f_name),
                            os.path.join(self.archive, f_name))
        shutil.move(os.path.join(self.path, f_name),
                    os.path.join(cwd, new_filename))

        new_data_entry = DataEntry(timestamp=timestamp)
        new_data_entry[self.field] = FileObject(f_path=new_filename,
                                                file_name=f_name,
                                                mime_type="")
        ret.append(new_data_entry)

    self.state["lasttime"] = format_timestamp(since) if since is not None else None
    return ret
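# The folder-scanning fetch above depends on a match_filename helper to turn a
# staged filename into a sample timestamp. A minimal sketch, assuming filenames
# are plain UTC epoch-second strings and that re/datetime are imported at module
# level; the real data source may match a different filename pattern.
def match_filename(self, f_name):
    """Return the UTC timestamp encoded in the filename, or None to skip the file."""
    m = re.match(r"^(\d+)$", f_name)
    if m is None:
        return None
    return datetime.datetime.utcfromtimestamp(int(m.group(1)))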
def fetch_sensorml(self, sos, caps, cwd, ret):
    sensorIDS = caps.getSensorIDs()
    sensorml_dir = os.path.join(cwd, "sensorml")
    if not os.path.exists(sensorml_dir):
        os.makedirs(sensorml_dir)

    for sensorID in sensorIDS:
        if sensorID not in self.state['sensorml']:
            logger.debug("Getting SensorML for %s" % sensorID)
            sml = sos.describeSensor(sensorID)
            sml_path = os.path.join(sensorml_dir, sensorID)
            with open(sml_path, "wb") as sensorml:
                sensorml.write(sml.getXMLString())

            timestamp = datetime.datetime.now()
            new_data_entry = DataEntry(timestamp=timestamp)
            new_data_entry[self.field] = FileObject(
                f_path=sml_path, mime_type=SOSMimeTypes.sensorML_1_0_1)
            ret.append(new_data_entry)

            self.state['sensorml'].append(sensorID)
        else:
            logger.debug("SensorML for %s already exists, ignoring." % sensorID)
def test_api_usage(self):
    # User data that is created by filling out the provisioning interface workflow steps.
    # General
    title = "Test project"
    data_manager = "A Person"
    project_lead = "Another Person"

    # Metadata
    project_region = Region("Test Region", ((1, 1), (2, 2), (2, 1), (1, 1)))

    # Methods & Datasets
    loc1 = Location(11.0, 11.0, "Test Site", 100)
    loc2 = Location(11.0, 11.0, "Test Site", 100)
    loc3 = Location(12.0, 11.0, "Test Site", 100)

    temp_work = self.ingester_platform.createUnitOfWork()
    temperature_schema = DataEntrySchema("Test Temp Schema")
    temperature_schema.addAttr(Double("temperature"))
    temp_work.post(temperature_schema)
    temp_work.commit()

    air_temperature_schema = DataEntrySchema("Air Temp Schema")
    air_temperature_schema.extends = [temperature_schema.id]
    air_temperature_schema = self.ingester_platform.post(air_temperature_schema)

    second_level_inheritence_schema = DataEntrySchema("Second Inheritence")
    second_level_inheritence_schema.extends = [air_temperature_schema.id]
    second_level_inheritence_schema = self.ingester_platform.post(
        second_level_inheritence_schema)

    # Check the name is set
    temperature_schema_1 = self.ingester_platform.getSchema(temperature_schema.id)
    self.assertIsNotNone(temperature_schema.name)
    self.assertEquals(temperature_schema.name, temperature_schema_1.name)

    file_schema = DataEntrySchema()
    file_schema.addAttr(FileDataType("file"))
    file_schema = self.ingester_platform.post(file_schema)

    dataset1 = Dataset(location=None, schema=temperature_schema.id)
    dataset2 = Dataset(
        location=None,
        schema=file_schema.id,
        data_source=PullDataSource(
            "http://test.com",
            "file_handle",
            processing_script="file://d:/processing_scripts/awsome_processing.py"))
    # dataset3 = Dataset(None, file_schema, PullDataSource("http://test.com", "file_handle"),
    #     CustomSampling("file://d:/sampling_scripts/awsome_sampling.py"),
    #     "file://d:/processing_scripts/awsome_processing.py")

    self.cleanup_files.append(dataset2.data_source.processing_script)
    # self.cleanup_files.push(dataset3.sampling.script)
    # self.cleanup_files.push(dataset3.processing_script)

    # Provisioning admin accepts the submitted project
    work = self.ingester_platform.createUnitOfWork()
    work.post(project_region)        # Save the region
    loc1.region = project_region.id  # Set the dataset's location to use the project's region
    work.post(loc1)                  # Save the location
    dataset1.location = loc1.id      # Set the dataset's location
    work.post(dataset1)              # Save the dataset
    loc2.region = project_region.id
    work.post(loc2)
    dataset2.location = loc2.id
    work.post(dataset2)
    work.commit()

    # Region, location and dataset id's will be saved to the project
    # within the provisioning system in some way

    # User searches for datasets
    # TODO: Nigel? - Define searching api
    found_dataset_id = dataset1.id  # The dataset that has an extended file schema

    # User manually enters data
    timestamp = datetime.datetime.now()
    data_entry_1 = DataEntry(found_dataset_id, timestamp)
    data_entry_1['temperature'] = 27.8  # Add the extended schema items
    data_entry_1 = self.ingester_platform.post(data_entry_1)
    self.assertIsNotNone(data_entry_1.id)

    timestamp2 = timestamp + datetime.timedelta(seconds=1)
    data_entry_2 = DataEntry(found_dataset_id, timestamp2)
    data_entry_2['temperature'] = 27.8  # Add the extended schema items
    data_entry_2 = self.ingester_platform.post(data_entry_2)

    self.assertEquals(2, len(self.ingester_platform.search(
        DataEntrySearchCriteria(found_dataset_id), 0, 10).results))

    result = self.ingester_platform.search(
        DataEntrySearchCriteria(found_dataset_id), 0, 1)
    self.assertEquals(2, result.count)
    self.assertEquals(1, len(result.results))

    self.assertEquals(1, len(self.ingester_platform.search(
        DataEntrySearchCriteria(found_dataset_id), 1, 1).results))

    result = self.ingester_platform.search(
        DataEntrySearchCriteria(found_dataset_id), 2, 1)
    self.assertEquals(0, len(result.results))

    self.assertEquals(0, len(self.ingester_platform.search(
        DataEntrySearchCriteria(
            found_dataset_id,
            end_time=timestamp - datetime.timedelta(seconds=60)), 0, 10).results))
    self.assertEquals(0, len(self.ingester_platform.search(
        DataEntrySearchCriteria(
            found_dataset_id,
            start_time=timestamp + datetime.timedelta(seconds=60)), 0, 10).results))
    self.assertEquals(2, len(self.ingester_platform.search(
        DataEntrySearchCriteria(
            found_dataset_id,
            start_time=timestamp - datetime.timedelta(seconds=60),
            end_time=timestamp + datetime.timedelta(seconds=60)), 0, 10).results))

    work = self.ingester_platform.createUnitOfWork()
    data_entry_3 = DataEntry(dataset2.id, datetime.datetime.now())
    data_entry_3['file'] = FileObject(
        f_handle=open(os.path.join(os.path.dirname(jcudc24ingesterapi.__file__),
                                   "tests/test_ingest.xml")),
        mime_type="text/xml")
    work.post(data_entry_3)
    work.commit()
    self.assertIsNotNone(data_entry_3.id)

    f_in = self.ingester_platform.getDataEntryStream(
        dataset2.id, data_entry_3.id, "file")
    self.assertIsNotNone(f_in)
    data = f_in.read()
    f_in.close()
    self.assertLess(0, len(data), "Expected data in file")

    # User enters quality assurance metadata
    quality_metadata_schema = DatasetMetadataSchema()
    quality_metadata_schema.addAttr(String("unit"))
    quality_metadata_schema.addAttr(String("description"))
    quality_metadata_schema.addAttr(Double("value"))
    quality_metadata_schema = self.ingester_platform.post(quality_metadata_schema)

    entered_metadata = DatasetMetadataEntry(data_entry_1.dataset,
                                            quality_metadata_schema.id)
    entered_metadata['unit'] = "%"
    entered_metadata['description'] = "Percent error"
    entered_metadata['value'] = 0.98
    entered_metadata = self.ingester_platform.post(entered_metadata)

    # Now find that metadata
    results = self.ingester_platform.search(
        DatasetMetadataSearchCriteria(data_entry_1.dataset), 0, 10).results
    self.assertEqual(1, len(results))

    data_entry_md_schema = DataEntryMetadataSchema("test")
    data_entry_md_schema.addAttr(String("description"))
    data_entry_md_schema.addAttr(Double("value"))
    data_entry_md_schema = self.ingester_platform.post(data_entry_md_schema)

    calibration = DataEntryMetadataEntry(
        metadata_schema_id=int(data_entry_md_schema.id),
        dataset_id=dataset2.id,
        object_id=data_entry_3.id)
    calibration["description"] = "Test"
    calibration["value"] = 1.2

    calibration2 = DataEntryMetadataEntry(
        metadata_schema_id=int(data_entry_md_schema.id),
        dataset_id=dataset2.id,
        object_id=data_entry_3.id)
    calibration2["description"] = "Test2"
    calibration2["value"] = 2.3
    calibration2 = self.ingester_platform.post(calibration2)

    calibrations = self.ingester_platform.search(
        DataEntryMetadataSearchCriteria(int(81), int(3648)),
        offset=0, limit=1000)
    self.assertEquals(1, len(calibrations.results))
    self.assertEquals(calibrations.results[0].schema_id, data_entry_md_schema.id)

    self.ingester_platform.delete(calibration2)
    self.ingester_platform.delete(calibration)
    self.ingester_platform.delete(data_entry_md_schema)

    # User changes sampling rate
    # FIXME: This test is going to be changed to be done by editing the dataset
    # sampling_rate_changed = Metadata(dataset1.id, type(dataset1), SampleRateMetadataSchema())
    # sampling_rate_changed.change_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
    # sampling_rate_changed.sampling = CustomSampling("file://d:/sampling_scripts/awsome_sampling.py")
    #
    # try:
    #     sampling_rate_changed = self.ingester_platform.post(sampling_rate_changed)
    #     assert(sampling_rate_changed.metadata_id is None, "Sampling rate change failed")
    # except:
    #     assert(True, "Sampling rate change failed")

    # User wants some random metadata specific to their project
    # FIXME: Not sure what use case this is trying to demonstrate
    # random_metadata_schema = DataEntryMetadataSchema()
    # random_metadata_schema.addAttr('random_field', Double())
    # random_metadata = Metadata(data_entry.data_entry_id, type(data_entry), random_metadata_schema)
    # random_metadata.random_field = 1.5
    # try:
    #     random_metadata = self.ingester_platform.post(random_metadata)
    #     assert(random_metadata.metadata_id is None, "random_metadata failed")
    # except:
    #     assert(True, "random_metadata failed")

    # User changes the data source of the dataset
    new_data_source = PullDataSource("http://test.com/new_data", "file_handle")
    dataset1.data_source = new_data_source
    dataset1 = self.ingester_platform.post(dataset1)
    self.assertNotEqual(None, dataset1)

    # External, 3rd party searches for data
    # TODO: external 3rd parties should be able to use the api to get data without authentication
    # TODO: I'm not sure exactly how this should work, but the search api could be
    #       open access (need spam limitations or something?)

    # Project is disabled/finished
    work = self.ingester_platform.createUnitOfWork()
    work.disable(dataset1.id)
    work.disable(dataset2.id)
    work.commit()

    # Project is obsolete and data should be deleted
    work = self.ingester_platform.createUnitOfWork()
    work.delete(dataset1.id)
    work.delete(dataset2.id)
    work.commit()
def fetch_http(self, cwd):
    """Recursively fetch from an HTTP server."""
    RE_A = re.compile(r"href=\"(\./){0,1}([0-9A-Za-z\-_\.\:]+)\"")
    req = urllib2.Request(self.url)
    ret = []

    since = None
    if "lasttime" in self.state and self.state["lasttime"] is not None \
            and len(self.state["lasttime"]) > 0:
        since = eut.formatdate(
            calendar.timegm(parse_timestamp(self.state["lasttime"]).timetuple()),
            usegmt=True)

    f_in = None
    try:
        f_index = urllib2.urlopen(req)
        index_page = f_index.read()
        f_index.close()

        urls = RE_A.findall(index_page)
        found = 0
        RE_FILENAME = None if self.pattern is None else re.compile(self.pattern)
        for url_part in urls:
            if RE_FILENAME is not None and RE_FILENAME.match(url_part[1]) is None:
                continue
            url = urlparse.urljoin(self.url, url_part[0] + url_part[1])
            req = urllib2.Request(url)
            if since is not None:
                req.add_header("If-Modified-Since", since)
            try:
                f_in = urllib2.urlopen(req)
                f_out_name = os.path.join(cwd, "outputfile%d" % found)
                timestamp = parse_timestamp_rfc_2822(f_in.headers["Last-Modified"])
                with open(f_out_name, "wb") as f_out:
                    shutil.copyfileobj(f_in, f_out)

                new_data_entry = DataEntry(timestamp=timestamp)
                file_name = None
                try:
                    file_name = url_part[1].split("/")[-1]
                except Exception:
                    pass
                new_data_entry[self.field] = FileObject(
                    f_path="outputfile%d" % found,
                    mime_type="",
                    file_name=file_name)
                ret.append(new_data_entry)

                found += 1
                if since is None or timestamp > since:
                    since = timestamp
            except urllib2.HTTPError as e:
                if e.code == 304:
                    continue
    finally:
        if f_in is not None:
            f_in.close()

    self.state["lasttime"] = format_timestamp(since) if since is not None else None
    return ret
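# Small illustration of the link pattern used by fetch_http above: RE_A only picks
# up simple hrefs (optionally prefixed with "./") made of word characters, dots,
# dashes and colons, so absolute URLs containing slashes are skipped. The sample
# HTML below is made up for illustration.
import re

sample_index = '<a href="./data_20130101.csv">x</a> <a href="http://other/">y</a>'
RE_A = re.compile(r"href=\"(\./){0,1}([0-9A-Za-z\-_\.\:]+)\"")
print(RE_A.findall(sample_index))  # -> [('./', 'data_20130101.csv')]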