def _execute(self, sources, alignment_stream, interval):
    max_interval = TimeInterval(MIN_DATE, interval.end)
    exp_list = {}

    for timestamp, value in sources[0].window(max_interval, force_calculation=True):
        if value['tier'] != "Experiment":
            continue

        d = deepcopy(value)
        mongo_id = d.pop('_id')
        trigger = d.pop('trigger')
        if trigger == 1:
            u = {'start': timestamp}
        else:
            u = {'end': timestamp}

        if mongo_id in exp_list:
            if next(iter(u)) in exp_list[mongo_id]:
                raise ValueError("Duplicate {} triggers found for timestamp {}".format(trigger, timestamp))
            exp_list[mongo_id].update(u)
        else:
            d.update(u)
            exp_list[mongo_id] = d

    for doc in exp_list.values():
        if TimeInterval(doc['start'], doc['end']) in max_interval:
            yield StreamInstance(doc['end'], doc)
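# Illustrative sketch (field names other than '_id', 'tier' and 'trigger' are assumed, not taken
# from the codebase): the tool above pairs "Experiment"-tier trigger documents that share a Mongo
# _id into a single interval document. For example, two source instances
#   (t0, {'_id': 'abc', 'tier': 'Experiment', 'trigger': 1, 'annotator': 'A'})
#   (t1, {'_id': 'abc', 'tier': 'Experiment', 'trigger': 0, 'annotator': 'A'})
# would yield one StreamInstance at t1 whose value is
#   {'annotator': 'A', 'start': t0, 'end': t1}
# provided TimeInterval(t0, t1) falls inside the calculation interval.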
def run(house, wearables, delete_existing_workflows=True, loglevel=logging.INFO):
    from hyperstream import HyperStream, TimeInterval, StreamNotFoundError
    from workflows.asset_splitter import split_sphere_assets
    from workflows.deploy_localisation_model import create_workflow_localisation_predict
    # from workflows.deploy_localisation_model_new_api import create_workflow_localisation_predict

    hyperstream = HyperStream(loglevel=loglevel, file_logger=None)
    D = hyperstream.channel_manager.mongo
    A = hyperstream.channel_manager.assets

    experiment_ids = A.find_stream(name="experiments_selected", house=house).window(
        TimeInterval.up_to_now()).last().value
    experiment_ids_str = '_'.join(experiment_ids)

    workflow_id0 = "asset_splitter"
    workflow_id1 = "lda_localisation_model_predict_" + experiment_ids_str

    if delete_existing_workflows:
        hyperstream.workflow_manager.delete_workflow(workflow_id0)
        hyperstream.workflow_manager.delete_workflow(workflow_id1)

    split_sphere_assets(hyperstream, house)

    try:
        w = hyperstream.workflow_manager.workflows[workflow_id1]
    except KeyError:
        w = create_workflow_localisation_predict(hyperstream, house=house,
                                                 experiment_ids=experiment_ids, safe=False)
        hyperstream.workflow_manager.commit_workflow(workflow_id1)

    # def safe_purge(channel, stream_id):
    #     try:
    #         channel.purge_stream(stream_id)
    #     except StreamNotFoundError:
    #         pass

    # A.purge_node("wearables_by_house")
    # A.purge_node("access_points_by_house")
    # D.purge_node("predicted_locations_broadcasted")

    # for h in [1, 2, 1176, 1116]:
    #     safe_purge(A, StreamId(name="wearables_by_house", meta_data=(('house', h),)))
    #     safe_purge(A, StreamId(name="access_points_by_house", meta_data=(('house', h),)))
    #     for w in wearables:
    #         safe_purge(D, StreamId(name="predicted_locations_broadcasted",
    #                                meta_data=(('house', h), ('wearable', w))))

    ti0 = TimeInterval.up_to_now()
    ti1 = TimeInterval.now_minus(minutes=1)
    # ti0 = TimeInterval(MIN_DATE, parse("2016-12-02 17:14:25.075Z"))
    # ti1 = TimeInterval(start=ti0.end - timedelta(minutes=1), end=ti0.end)

    w.execute(ti1)

    print('number of non_empty_streams: {}'.format(
        len(hyperstream.channel_manager.memory.non_empty_streams)))

    from display_localisation_predictions import display_predictions
    display_predictions(hyperstream, ti1, house, wearables=wearables)
def test_time_interval(self):
    i1 = TimeIntervals([
        TimeInterval(now, now + hour),
        TimeInterval(now + 2 * hour, now + 3 * hour),
    ])

    i2 = TimeIntervals([
        TimeInterval(now + 30 * minute, now + 30 * minute + 2 * hour),
    ])

    # print(i1)
    assert (i1 == TimeIntervals(intervals=[
        TimeInterval(start=datetime(2016, 1, 1, 0, 0), end=datetime(2016, 1, 1, 1, 0)),
        TimeInterval(start=datetime(2016, 1, 1, 2, 0), end=datetime(2016, 1, 1, 3, 0))]))

    # print(i2)
    # print()

    s = i1 + i2
    assert (s == TimeIntervals(intervals=[
        TimeInterval(start=datetime(2016, 1, 1, 0, 0), end=datetime(2016, 1, 1, 3, 0))]))

    d = i1 - i2
    assert (d == TimeIntervals(intervals=[
        TimeInterval(start=datetime(2016, 1, 1, 0, 0), end=datetime(2016, 1, 1, 0, 30)),
        TimeInterval(start=datetime(2016, 1, 1, 2, 30), end=datetime(2016, 1, 1, 3, 0))]))
def _execute(self, source, splitting_stream, interval, output_plate):
    raise NotImplementedError  # the development of this tool has not been finished

    if splitting_stream is None:
        raise ValueError("Splitting stream required for this tool")

    if isinstance(splitting_stream, AssetStream):
        time_interval = TimeInterval(MIN_DATE, interval.end)
        splitter = splitting_stream.window(time_interval, force_calculation=True).last()
    else:
        splitter = splitting_stream.window(interval, force_calculation=True).last()

    if not splitter:
        logging.debug("No assets found for source {} and splitter {}".format(
            source.stream_id, splitting_stream.stream_id))
        return

    mapping = splitter.value

    for timestamp, value in source.window(interval, force_calculation=True):
        for key in value.keys():
            if key not in mapping:
                logging.warn("Unknown value {} for meta data in SplitterOfDictFromStream".format(key))
                continue
            plate_value = mapping[key]
            yield StreamMetaInstance((timestamp, value[key]),
                                     (output_plate.meta_data_id, plate_value))
def test_tool_channel_new_api(self):
    with HyperStream(file_logger=False, console_logger=False, mqtt_logger=None) as hs:
        M = hs.channel_manager.memory

        # new way of loading tools
        clock_new = hs.tools.clock()

        # old way of loading tools
        clock_old = hs.channel_manager.tools["clock"].window((MIN_DATE, utcnow())).last().value()

        # TODO: NOTE THAT IF WE DO IT THE OLD WAY FIRST, THEN THE NEW WAY FAILS WITH:
        # TypeError: super(type, obj): obj must be an instance or subtype of type
        # which possibly relates to:
        # https://stackoverflow.com/questions/9722343/python-super-behavior-not-dependable

        ticker_old = M.get_or_create_stream("ticker_old")
        ticker_new = M.get_or_create_stream("ticker_new")

        now = utcnow()
        before = (now - timedelta(seconds=30)).replace(tzinfo=UTC)
        ti = TimeInterval(before, now)

        clock_old.execute(sources=[], sink=ticker_old, interval=ti)
        clock_new.execute(sources=[], sink=ticker_new, interval=ti)

        self.assertListEqual(ticker_old.window().values(), ticker_new.window().values())
def test_simple_workflow(self):
    # Create a simple one step workflow for querying
    w = hyperstream.create_workflow(
        workflow_id="simple_query_workflow",
        name="Simple query workflow",
        owner="TD",
        description="Just a test of creating workflows")

    time_interval = TimeInterval(t1, t1 + 1 * minute)

    # Create some streams (collected in a node)
    node = w.create_node(stream_name="environmental", channel=S, plate_ids=["H1"])  # .window((t1, t1 + 1 * minute))

    # Create a factor to produce some data
    w.create_multi_output_factor(
        tool=dict(name="sphere", parameters=dict(modality="environmental")),
        source=None,
        splitting_node=None,
        sink=node)

    # Execute the workflow
    w.execute(time_interval)

    # Check the values
    assert (node.streams[('house', '1'), ].window(time_interval).first().value ==
            {u'electricity': 0.0, 'uid': u'04063'})
def test_basic_aggregator(self):
    """
    An average of RSS values per location
    """
    # access_points_by_wearable_and_house().execute(TimeInterval.up_to_now())
    w = basic_workflow(sys._getframe().f_code.co_name)

    N = w.nodes
    w.create_factor(
        tool=channels.get_tool(
            name="aggregate",
            parameters=dict(func=online_average, aggregation_meta_data="wearable")
        ),
        sources=[N["rss"]],
        sink=N["rss_dev_avg"]
    )

    time_interval = TimeInterval(scripted_experiments[0].start,
                                 scripted_experiments[0].start + 2 * minute)

    w.execute(time_interval)

    print_head(w, "rss", h1 + wA, locs, time_interval, 10, print)
    print_head(w, "rss_dev_avg", h1, locs, time_interval, 10, print)

    assert all(list(N["rss_dev_avg"].streams[k].window(time_interval).head(10)) == v
               for k, v in RSS_DEV_AVG.items())
def test_off_plate_aggregator(self):
    """
    This is a test for aggregation where the aggregate is up the tree, but the destination
    plate is not in the ancestry. For example:

        source plate: H1.L.W
        aggregate:    L
        destination:  H1.W

    Note here that H1.W is not an ancestor of H1.L.W (only H1 and H1.L are), so we have to
    figure out that H1.W is a valid destination, based on the fact that all but one of the
    meta data IDs are shared.
    """
    w = basic_workflow(sys._getframe().f_code.co_name)

    aggregate_loc = channels.get_tool(
        name="aggregate",
        parameters=dict(func=online_average, aggregation_meta_data="location")
    )

    N = w.nodes
    w.create_factor(
        tool=aggregate_loc,
        sources=[N["rss"]],
        sink=N["rss_loc_avg"]
    )

    time_interval = TimeInterval(scripted_experiments[0].start,
                                 scripted_experiments[0].start + 2 * minute)

    w.execute(time_interval)

    print_head(w, "rss", h1 + wA, locs, time_interval, 10, print)
    print_head(w, "rss_loc_avg", h1, wA, time_interval, 10, print)

    assert all(list(N["rss_loc_avg"].streams[k].window(time_interval).head(10)) == v
               for k, v in RSS_LOC_AVG.items())
def test_index_of_by_stream(self):
    w = basic_workflow(sys._getframe().f_code.co_name)

    aggregate_loc = channels.get_tool(
        name="index_of_by_stream",
        parameters=dict(index="kitchen")
    )

    # Create a stream with the single value "location" in it
    w.create_node(stream_name="selector_meta_data", channel=A, plate_ids=None)
    A.write_to_stream(stream_id=StreamId(name="selector_meta_data"),
                      data=StreamInstance(timestamp=utcnow(), value="location"))

    N = w.nodes
    w.create_factor(
        tool=aggregate_loc,
        sources=[N["selector_meta_data"], N["rss"]],
        sink=N["rss_kitchen"]
    )

    time_interval = TimeInterval(scripted_experiments[0].start,
                                 scripted_experiments[0].start + 2 * minute)

    w.execute(time_interval)

    key = h1 + (('location', 'kitchen'),) + wA

    assert all(a == b for a, b in zip(
        N['rss_kitchen'].streams[h1 + wA].window(time_interval).head(10),
        N['rss'].streams[key].window(time_interval).head(10)))
def test_save_workflow(self):
    workflow_id = sys._getframe().f_code.co_name

    # hyperstream.logger.setLevel(logging.WARN)

    # First delete the workflow if it's there
    hyperstream.workflow_manager.delete_workflow(workflow_id)

    w1 = basic_workflow(workflow_id)

    time_interval = TimeInterval(scripted_experiments[0].start,
                                 scripted_experiments[0].start + 2 * minute)

    w1.execute(time_interval)

    hyperstream.workflow_manager.commit_workflow(workflow_id)

    # Now remove it from the workflow manager
    del hyperstream.workflow_manager.workflows[workflow_id]

    # And then reload it
    w2 = hyperstream.workflow_manager.load_workflow(workflow_id)

    # print_head(w, "rss", h1 + wA, locs, time_interval, 10, print)
    # print_head(w, "rss_dev_avg", h1, locs, time_interval, 10, print)

    assert all(list(w1.nodes["rss_dev_avg"].streams[k].window(time_interval).head(10)) == v
               for k, v in RSS_DEV_AVG.items())
    assert all(list(w2.nodes["rss_dev_avg"].streams[k].window(time_interval).head(10)) == v
               for k, v in RSS_DEV_AVG.items())
def _execute(self, source, splitting_stream, interval, output_plate):
    if splitting_stream is None:
        raise ValueError("Splitting stream required for this tool")

    if isinstance(splitting_stream, AssetStream):
        time_interval = TimeInterval(MIN_DATE, interval.end)
        splitter = splitting_stream.window(time_interval, force_calculation=True).last()
    else:
        splitter = splitting_stream.window(interval, force_calculation=True).last()

    if not splitter:
        logging.debug("No assets found for source {} and splitter {}".format(
            source.stream_id, splitting_stream.stream_id))
        return

    mapping = splitter.value

    for timestamp, value in source.window(interval, force_calculation=True):
        if self.element not in value:
            logging.debug("Mapping element {} not in instance".format(self.element))
            continue

        value = deepcopy(value)
        meta_data = str(value.pop(self.element))

        if meta_data not in mapping:
            logging.warn("Unknown value {} for meta data {}".format(meta_data, self.element))
            continue

        plate_value = mapping[meta_data]
        yield StreamMetaInstance((timestamp, value), (output_plate.meta_data_id, plate_value))
def run(delete_existing_workflows=True, loglevel=logging.INFO):
    from hyperstream import HyperStream, TimeInterval
    from workflows.deploy_summariser import create_workflow_coord_plate_creation, create_workflow_summariser
    from sphere_connector_package.sphere_connector import SphereConnector

    hyperstream = HyperStream(loglevel=loglevel, file_logger=None)

    if not globs['sphere_connector']:
        globs['sphere_connector'] = SphereConnector(
            config_filename='config.json',
            include_mongo=True,
            include_redcap=False,
            sphere_logger=None)

    workflow_id = "coord3d_plate_creation"
    if delete_existing_workflows:
        hyperstream.workflow_manager.delete_workflow(workflow_id)
    try:
        w = hyperstream.workflow_manager.workflows[workflow_id]
    except KeyError:
        w = create_workflow_coord_plate_creation(hyperstream, safe=False)
        hyperstream.workflow_manager.commit_workflow(workflow_id)

    time_interval = TimeInterval.now_minus(minutes=1)
    w.execute(time_interval)

    workflow_id = "periodic_summaries"
    if delete_existing_workflows:
        hyperstream.workflow_manager.delete_workflow(workflow_id)
    try:
        w = hyperstream.workflow_manager.workflows[workflow_id]
    except KeyError:
        w = create_workflow_summariser(hyperstream,
                                       env_window_size=1 * 60 * 60.0,
                                       rss_window_size=4 * 60 * 60.0,
                                       acc_window_size=4 * 60 * 60.0,
                                       vid_window_size=4 * 60 * 60.0,
                                       pred_window_size=4 * 60 * 60.0,
                                       safe=False)
        hyperstream.workflow_manager.commit_workflow(workflow_id)

    time_interval = TimeInterval.now_minus(minutes=1)
    w.execute(time_interval)

    print('number of non_empty_streams: {}'.format(
        len(hyperstream.channel_manager.memory.non_empty_streams)))
def _execute(self, source, splitting_stream, interval, output_plate):
    if splitting_stream is None:
        raise ValueError("Splitting stream required for this tool")

    if isinstance(splitting_stream, AssetStream):
        time_interval = TimeInterval(MIN_DATE, interval.end)
        splitter = splitting_stream.window(time_interval, force_calculation=True).last()
    else:
        splitter = splitting_stream.window(interval, force_calculation=True).last()

    if not splitter:
        logging.debug("No assets found for source {} and splitter {}".format(
            source.stream_id, splitting_stream.stream_id))
        return

    mapping = splitter.value
    try:
        # mapping is a dict
        if len(mapping.keys()) == 0:
            logging.warn("The mapping provided to splitter_from_stream by the last element "
                         "of the splitting stream is empty")
        if self.use_mapping_keys_only:
            mapping = dict([(x, x) for x in mapping.keys()])
    except AttributeError:
        # mapping is a list rather than a dict
        mapping = dict([(x, x) for x in mapping])

    for timestamp, value in source.window(interval, force_calculation=True):
        if self.element is None:
            for meta_data, sub_value in value.items():
                if meta_data not in mapping:
                    logging.warn("Unexpected splitting value {}".format(meta_data))
                    continue
                plate_value = mapping[meta_data]
                yield StreamMetaInstance((timestamp, sub_value),
                                         (output_plate.meta_data_id, plate_value))
        else:
            if self.element not in value:
                logging.debug("Mapping element {} not in instance".format(self.element))
                continue
            value = deepcopy(value)
            meta_data = str(value.pop(self.element))
            if meta_data not in mapping:
                logging.warn("Unknown value {} for meta data {}".format(meta_data, self.element))
                continue
            plate_value = mapping[meta_data]
            yield StreamMetaInstance((timestamp, value),
                                     (output_plate.meta_data_id, plate_value))
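# Illustrative sketch (the mapping and values below are assumed, not taken from the codebase):
# with mapping {'aaaa': 'kitchen', 'bbbb': 'lounge'} and self.element is None, a source instance
#   (t, {'aaaa': -55, 'bbbb': -70})
# is split by the tool above into one StreamMetaInstance per known key:
#   StreamMetaInstance((t, -55), (output_plate.meta_data_id, 'kitchen'))
#   StreamMetaInstance((t, -70), (output_plate.meta_data_id, 'lounge'))
# whereas with self.element set, the element's value is popped from the document and used to
# look up the destination plate value instead.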
def _execute(self, sources, alignment_stream, interval):
    sliding_window = sources[0].window(interval, force_calculation=True)

    result = sliding_window.first()
    if result is None:
        return

    sources[1].window(TimeInterval(interval.start, interval.end), force_calculation=True).first()

    return
    yield  # required to make this function into a generator
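# Aside (pure Python, no HyperStream dependency): a `yield` anywhere in a function body makes it
# a generator function, so the `return` followed by an unreachable `yield` above gives a
# generator that forces the side-effecting window calculations but produces no items.
def _empty_generator():
    return
    yield

assert list(_empty_generator()) == []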
def _execute(self, sources, alignment_stream, interval):
    data = sources[0].window(interval, force_calculation=True)

    mappings = []
    for x in data:
        experiment_interval = TimeInterval(x.value['start'], x.value['end'])
        experiment_id = construct_experiment_id(experiment_interval)
        if experiment_id in self.experiment_ids:
            mappings.append((experiment_id, experiment_interval))

    yield StreamInstance(interval.end, mappings)
def test_database_channel(self):
    # Simple querying
    ti = TimeInterval(t1, t1 + minute)

    # Get or create the stream that lives in the database
    env = D.get_or_create_stream(stream_id=StreamId('environmental_db', (("house", "1"),)))
    D.purge_stream(env.stream_id)

    env_tool = channels.get_tool(
        "sphere", dict(modality="environmental", rename_keys=True, dedupe=True))

    env_tool.execute(
        source=None,
        splitting_stream=None,
        sinks=[env],
        interval=ti,
        input_plate_value=None,
        output_plate=hyperstream.plate_manager.plates["H"])

    # Create stream whose source will be the above database stream
    elec = M.create_stream(StreamId('electricity'))

    env_tool = channels.get_tool(
        "sphere", dict(modality="environmental", rename_keys=True, dedupe=True))
    elec_tool = T[component].window((MIN_DATE, utcnow())).last().value(key='electricity-04063')

    env_tool.execute(
        source=None,
        splitting_stream=None,
        sinks=[env],
        interval=ti,
        input_plate_value=None,
        output_plate=hyperstream.plate_manager.plates["H"])

    elec_tool.execute(sources=[env], sink=elec, interval=ti, alignment_stream=None)

    q1 = "\n".join("=".join(map(str, ee)) for ee in elec.window(ti))

    # print(q1)
    # print(edl)

    assert (q1 == '2016-04-28 20:00:00.159000+00:00=0.0\n'
                  '2016-04-28 20:00:06.570000+00:00=0.0\n'
                  '2016-04-28 20:00:12.732000+00:00=0.0\n'
                  '2016-04-28 20:00:25.125000+00:00=0.0\n'
                  '2016-04-28 20:00:31.405000+00:00=0.0\n'
                  '2016-04-28 20:00:50.132000+00:00=0.0')

    assert (elec.window(ti).values() == [0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
def main(dataset, classifier, epochs, seed, batchsize):
    hs = HyperStream(loglevel=30)
    print(hs)
    print([p.channel_id_prefix for p in hs.config.plugins])

    M = hs.channel_manager.memory

    data = getattr(datasets, 'load_{}'.format(dataset))()
    data_tool = hs.plugins.sklearn.tools.dataset(data, shuffle=True, epochs=epochs, seed=seed)
    data_stream = M.get_or_create_stream('dataset')

    model = getattr(linear_model, classifier)()
    classifier_tool = hs.plugins.sklearn.tools.classifier(model)
    classifier_stream = M.get_or_create_stream('classifier')

    now = datetime.utcnow().replace(tzinfo=UTC)
    now = (now - timedelta(hours=1)).replace(tzinfo=UTC)
    before = datetime.utcfromtimestamp(0).replace(tzinfo=UTC)
    ti = TimeInterval(before, now)

    data_tool.execute(sources=[], sink=data_stream, interval=ti)

    print("Example of a data stream")
    key, value = next(iter(data_stream.window()))
    print('[%s]: %s' % (key, value))

    mini_batch_tool = hs.plugins.sklearn.tools.minibatch(batchsize=batchsize)
    mini_batch_stream = M.get_or_create_stream('mini_batch')
    mini_batch_tool.execute(sources=[data_stream], sink=mini_batch_stream, interval=ti)

    classifier_tool.execute(sources=[mini_batch_stream], sink=classifier_stream, interval=ti)

    scores = []
    for key, value in classifier_stream.window():
        scores.append(value['score'])

    # The data is repeated for the given number of epochs, so the mini-batches cycle and can
    # contain data from both the beginning and the end of the dataset. This means the number
    # of scores may not be divisible by the number of epochs.
    if batchsize == 1:
        print("Test scores per epoch")
        scores = np.array(scores).reshape(epochs, -1)
        print(scores.mean(axis=1).round(decimals=2))
    else:
        scores = np.array(scores).reshape(1, -1)
        print("Test scores per minibatch (cyclic)")
        print(scores.round(decimals=2))
def run(delete_existing_workflows=True, loglevel=logging.INFO):
    from hyperstream import HyperStream, TimeInterval
    from workflows.summaries_to_csv import create_workflow_summaries_to_csv
    from sphere_connector_package.sphere_connector import SphereConnector

    if not globs['sphere_connector']:
        globs['sphere_connector'] = SphereConnector(
            config_filename='config.json',
            include_mongo=True,
            include_redcap=False,
            sphere_logger=None)

    hyperstream = HyperStream(loglevel=loglevel, file_logger=None)

    workflow_id = "summaries_to_csv"
    if delete_existing_workflows:
        hyperstream.workflow_manager.delete_workflow(workflow_id)
    try:
        w = hyperstream.workflow_manager.workflows[workflow_id]
    except KeyError:
        # percentile_results = []
        # w = create_workflow_summaries_to_csv(hyperstream, percentile_results=percentile_results, safe=False)
        w = create_workflow_summaries_to_csv(hyperstream, safe=False)
        hyperstream.workflow_manager.commit_workflow(workflow_id)

    day_str = "2016_12_15_23_00"
    # Earlier intervals kept for reference; only the last t1/t2 assignment takes effect
    t1 = parse("2016-12-15T19:58:25Z")
    t2 = parse("2016-12-15T20:01:05Z")
    t1 = parse("2016-12-15T22:58:25Z")
    t2 = parse("2016-12-15T23:01:05Z")
    t1 = parse("2017-02-24T08:01:00Z")
    t2 = parse("2017-02-24T08:04:00Z")
    t_1_2 = TimeInterval(start=t1, end=t2)

    # w.factors[0].execute(t_1_2)
    w.execute(t_1_2)

    env_results = w.factors[0].tool.global_result_list
    csv_string = pd.DataFrame(env_results).to_csv(sep="\t", header=False)
    with open("mk/visualise_summaries/env_summaries_{}.csv".format(day_str), "w") as text_file:
        text_file.write(csv_string)

    # print(env_results)
    # print(percentile_results)

    # time_interval = TimeInterval.now_minus(minutes=1)
    # w.execute(time_interval)

    print('number of non_empty_streams: {}'.format(
        len(hyperstream.channel_manager.memory.non_empty_streams)))
def _execute(self, sources, alignment_stream, interval):
    data = list(sources[0].window(interval, force_calculation=True))

    flattened = [dict(dict(
        experiment_id=construct_experiment_id(TimeInterval(x.value['start'], x.value['end'])),
        start=x.value['start'],
        end=x.value['end'],
        annotator=x.value['annotator']
    ), **(x.value['notes'])) for x in data]

    df = pd.DataFrame(flattened)
    df['id'] = range(1, len(df) + 1)
    yield StreamInstance(interval.end, df)
def _execute(self, sources, alignment_stream, interval):
    sliding_window = sources[0].window(interval, force_calculation=True)

    first_window = sliding_window.first()
    if first_window is None:
        return

    try:
        last_window = list(sliding_window)[-1]
    except IndexError:
        last_window = first_window

    sources[1].window(TimeInterval(first_window.value.start, last_window.value.end),
                      force_calculation=True).first()

    return
    yield  # required to make this function into a generator
def main(dataset, model, epochs, seed, batchsize):
    hs = HyperStream(loglevel=30)
    print(hs)
    print([p.channel_id_prefix for p in hs.config.plugins])

    M = hs.channel_manager.memory

    data = getattr(datasets, 'load_{}'.format(dataset))()
    data_tool = hs.plugins.sklearn.tools.dataset(data, shuffle=True, epochs=epochs, seed=seed)
    data_stream = M.get_or_create_stream('dataset')

    anomaly_detector_tool = hs.plugins.sklearn.tools.anomaly_detector(model)
    anomaly_detector_stream = M.get_or_create_stream('anomaly_detector')

    now = datetime.utcnow()
    now = (now - timedelta(hours=1))
    before = datetime.utcfromtimestamp(0)
    ti = TimeInterval(before, now)

    data_tool.execute(sources=[], sink=data_stream, interval=ti)

    print("Example of a data stream")
    key, value = next(iter(data_stream.window()))
    print('[%s]: %s' % (key, value))

    mini_batch_tool = hs.plugins.sklearn.tools.minibatch(batchsize=batchsize)
    mini_batch_stream = M.get_or_create_stream('mini_batch')
    mini_batch_tool.execute(sources=[data_stream], sink=mini_batch_stream, interval=ti)

    anomaly_detector_tool.execute(sources=[mini_batch_stream], sink=anomaly_detector_stream,
                                  interval=ti)

    probas = []
    for key, value in anomaly_detector_stream.window():
        probas.append(value['proba'])

    # The data is repeated for the given number of epochs, so the mini-batches cycle and can
    # contain data from both the beginning and the end of the dataset. This means the number
    # of scores may not be divisible by the number of epochs.
    probas = np.array(probas)
    print(probas.shape)
    means = np.array([np.nanmean(aux) for aux in probas])
    print(means.shape)
    print("Test probabilities per minibatch (cyclic)")
    print(means.round(decimals=2))
def stream(channel, name, meta_data, mimetype, func, parameters=None, start=None, end=None):
    try:
        stream = hs.channel_manager[channel].find_stream(name=name, **meta_data)
        if start and end:
            ti = TimeInterval(start, end)
            window = stream.window(ti)
        else:
            window = stream.window()
    except (KeyError, TypeError, MultipleStreamsFoundError, StreamNotFoundError,
            StreamNotAvailableError) as e:
        return exception_json(e, dict(channel=channel, name=name, meta_data=meta_data,
                                      start=start, end=end))

    try:
        if hasattr(window, func):
            if parameters:
                data = getattr(window, func)(*(KNOWN_TYPES[p[0]](p[1]) for p in parameters))
            else:
                data = getattr(window, func)()
            from collections import deque
        else:
            return jsonify({'exception': "Function not available", "message": func})
    except (KeyError, TypeError) as e:
        return exception_json(e, (func, parameters))

    try:
        return ENDPOINTS[mimetype](data)
    except KeyError as e:
        # FIXME is the error message informative?
        # The previous message was: 'Endpoint not found'
        return exception_json(e, mimetype)
    except TypeError as e:
        return exception_json(e, (func, parameters, str(list(data))))
def test_data_generators(self):
    M = self.hs.channel_manager.memory
    T = self.hs.plugins.sklearn.tools

    data = load_iris()
    epochs = 10
    seed = 42
    batchsize = 2

    data_tool = T.dataset(data, shuffle=True, epochs=epochs, seed=seed)
    data_stream = M.get_or_create_stream('dataset')

    model = 'Gaussian'
    anomaly_detector_tool = T.anomaly_detector(model)
    anomaly_detector_stream = M.get_or_create_stream('anomaly_detector')

    now = datetime.utcnow()
    now = (now - timedelta(hours=1))
    before = datetime.utcfromtimestamp(0)
    ti = TimeInterval(before, now)

    data_tool.execute(sources=[], sink=data_stream, interval=ti)

    print("Example of a data stream")
    key, value = next(iter(data_stream.window()))
    print('[%s]: %s' % (key, value))

    mini_batch_tool = T.minibatch(batchsize=batchsize)
    mini_batch_stream = M.get_or_create_stream('mini_batch')
    mini_batch_tool.execute(sources=[data_stream], sink=mini_batch_stream, interval=ti)

    anomaly_detector_tool.execute(sources=[mini_batch_stream], sink=anomaly_detector_stream,
                                  interval=ti)

    probas = []
    for key, value in anomaly_detector_stream.window():
        probas.append(value['proba'])

    # The data is repeated for the given number of epochs, so the mini-batches cycle and can
    # contain data from both the beginning and the end of the dataset. This means the number
    # of scores may not be divisible by the number of epochs.
    probas = np.array(probas)
    print(probas.shape)
    means = np.array([np.nanmean(aux) for aux in probas])

    np.testing.assert_almost_equal(true_means, means, decimal=2)

    print(means.shape)
    print("Test probabilities per minibatch (cyclic)")
    print(means.round(decimals=2))
def test_sessions(self):
    hs = HyperStream(loglevel=logging.CRITICAL)
    print_sessions(hs)
    # hs.clear_sessions(inactive_only=False, clear_history=True)
    # TODO: this needs to clear stream definitions as well
    hs.clear_sessions(clear_history=True)
    print("after clearing")
    print_sessions(hs)
    assert (len(hs.sessions) == 0)
    del hs

    with HyperStream(loglevel=logging.CRITICAL) as hs:
        print("enter ...")
        print_sessions(hs)
        assert (len(hs.sessions) == 1)
        assert hs.current_session.active

        M = hs.channel_manager.memory
        dg = hs.plugins.data_generators
        ticker = M.get_or_create_stream("ticker")
        random = M.get_or_create_stream("random")

        ti = TimeInterval(t1, t1 + minute)
        hs.tools.clock().execute(sources=[], sink=ticker, interval=ti)
        dg.tools.random().execute(sources=[], sink=random, interval=ti, alignment_stream=ticker)

        history = hs.current_session.history
        for item in history:
            print(item)

        assert (history[0].value['tool'] == 'clock')
        assert (history[1].value['tool'] == 'random')
        assert (history[0].value['document_count'] == 60)
        assert (history[1].value['document_count'] == 60)

    print("exit ...")
    hs = HyperStream(loglevel=logging.CRITICAL)
    assert hs.current_session is None
    print_sessions(hs)
    assert (len(hs.sessions) == 1)
    assert hs.sessions[0].end is not None
    assert not hs.sessions[0].active
def run(house, wearables, loglevel=logging.CRITICAL):
    from hyperstream import HyperStream, TimeInterval

    if not globs['hyperstream']:
        globs['hyperstream'] = HyperStream(loglevel=loglevel, file_logger=None)

    display_predictions(globs['hyperstream'], TimeInterval.now_minus(minutes=1), house, wearables)
    print()

    from display_access_points import display_access_points

    display_access_points(house=house)
    print()
def _execute(self, sources, alignment_stream, interval):
    if interval.start < self.first:
        interval.start = self.first

    n_widths = int((interval.start - self.first).total_seconds() // self.width.total_seconds())

    lower = self.first + n_widths * self.width
    upper = lower + self.width

    while upper <= interval.end:
        yield StreamInstance(upper, TimeInterval(lower, upper))
        lower += self.increment
        upper += self.increment
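# A minimal, self-contained sketch (not part of the tool above; all values assumed) illustrating
# the window arithmetic used by _execute: for an anchor `first`, window `width` and slide
# `increment`, the first emitted window is aligned to `first` plus a whole number of widths, and
# subsequent windows slide forward by `increment` until they pass the end of the interval.
from datetime import datetime, timedelta

first = datetime(2016, 1, 1, 0, 0)      # assumed anchor (self.first)
width = timedelta(minutes=10)           # assumed window width (self.width)
increment = timedelta(minutes=5)        # assumed slide increment (self.increment)
start = datetime(2016, 1, 1, 0, 7)      # assumed interval.start
end = datetime(2016, 1, 1, 0, 30)       # assumed interval.end

n_widths = int((start - first).total_seconds() // width.total_seconds())
lower = first + n_widths * width
upper = lower + width
while upper <= end:
    # prints 00:00->00:10, 00:05->00:15, 00:10->00:20, 00:15->00:25, 00:20->00:30
    print(lower.time(), "->", upper.time())
    lower += increment
    upper += increment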
def _execute(self, sources, alignment_stream, interval):
    data = sources[0].window(interval, force_calculation=True)
    try:
        experiment_ids = sources[1].window(interval, force_calculation=True).last().value
    except AttributeError:
        return

    mappings = []
    for x in data:
        experiment_interval = TimeInterval(x.value['start'], x.value['end'])
        experiment_id = construct_experiment_id(experiment_interval)
        if experiment_id in experiment_ids:
            mappings.append((experiment_id, experiment_interval))

    yield StreamInstance(interval.end, mappings)
def test_iris(self):
    M = self.hs.channel_manager.memory
    T = self.hs.plugins.sklearn.tools

    data = load_iris()
    epochs = 10
    seed = 42
    batchsize = 2

    data_tool = T.dataset(data, shuffle=True, epochs=epochs, seed=seed)
    data_stream = M.get_or_create_stream('dataset')

    now = datetime.utcnow()
    now = (now - timedelta(hours=1))
    before = datetime.utcfromtimestamp(0)
    ti = TimeInterval(before, now)

    data_tool.execute(sources=[], sink=data_stream, interval=ti)

    print("Example of a data stream")
    key, value = next(iter(data_stream.window()))
    print('[%s]: %s' % (key, value))

    mini_batch_tool = T.minibatch(batchsize=batchsize)
    mini_batch_stream = M.get_or_create_stream('mini_batch')
    mini_batch_tool.execute(sources=[data_stream], sink=mini_batch_stream, interval=ti)

    key, value = mini_batch_stream.window().items()[0]
    assert (key == datetime(1970, 1, 1, 0, 2, tzinfo=UTC))

    expected_value = {
        'x_te': np.array([[5.6, 2.8, 4.9, 2.],
                          [7.3, 2.9, 6.3, 1.8]]),
        'x_tr': np.array([[6., 2.2, 5., 1.5],
                          [5., 2., 3.5, 1.]]),
        'y_te': np.array([[0, 0, 1],
                          [0, 0, 1]]),
        'y_tr': np.array([[0, 0, 1],
                          [0, 1, 0]])}

    for e_key, e_value in expected_value.items():
        assert (e_key in value)
        np.testing.assert_equal(e_value, value[e_key])
def test_plugins(self):
    with HyperStream(file_logger=False, console_logger=False, mqtt_logger=None) as hs:
        M = hs.channel_manager.memory

        clock_tool = hs.tools.clock()
        dummy_tool = hs.plugins.example.tools.dummy()

        ticker = M.get_or_create_stream("ticker")
        ticker_copy = M.get_or_create_stream("ticker_copy")

        before = now - timedelta(seconds=30)
        ti = TimeInterval(before, now)

        clock_tool.execute(sources=[], sink=ticker, interval=ti)
        dummy_tool.execute(sources=[ticker], sink=ticker_copy, interval=ti)

        assert (all(map(lambda pair: pair[0].value == pair[1].value,
                        zip(ticker.window(), ticker_copy.window()))))
def test_save_workflow(self):
    hs = HyperStream(file_logger=False, console_logger=False, mqtt_logger=None)
    workflow_id = sys._getframe().f_code.co_name

    # First delete the workflow if it's there
    hs.workflow_manager.delete_workflow(workflow_id)

    w = basic_workflow(hs, workflow_id)

    time_interval = TimeInterval(t1, t2)
    w.execute(time_interval)

    hs.workflow_manager.commit_workflow(workflow_id)

    # Now remove it from the workflow manager
    del hs.workflow_manager.workflows[workflow_id]

    # And then reload it
    hs.workflow_manager.load_workflow(workflow_id)