def _execute(self, sources, alignment_stream, interval):
    if self.names and len(self.names) != len(sources):
        raise TypeError(
            "Tool AlignedMerge expected {} streams as input, got {} instead"
            .format(len(self.names), len(sources)))
    streams = [
        iter(source.window(interval, force_calculation=True))
        for source in sources
    ]
    # Take data from the streams until any one of them is exhausted
    while True:
        try:
            docs = [next(stream) for stream in streams]
            times = [tt for (tt, dd) in docs]
            for tt in times[1:]:
                if tt != times[0]:
                    raise ValueError(
                        "Tool AlignedMerge expects aligned streams, "
                        "but received conflicting timestamps {} and {}"
                        .format(times[0], tt))
            values = [dd for (tt, dd) in docs]
            if self.names is None:
                yield StreamInstance(times[0], values)
            else:
                # noinspection PyTypeChecker
                yield StreamInstance(
                    times[0],
                    {name: values[i] for i, name in enumerate(self.names)})
        except StopIteration:
            break
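
# A minimal, self-contained sketch of the aligned-merge behaviour above, using
# plain (timestamp, value) tuples instead of HyperStream streams; the function
# name and the example data are hypothetical, for illustration only. zip()
# stops at the shortest stream, mirroring the StopIteration break.
def aligned_merge_sketch(streams, names=None):
    for docs in zip(*streams):
        times = [tt for tt, _ in docs]
        if any(tt != times[0] for tt in times[1:]):
            raise ValueError("conflicting timestamps in aligned streams")
        values = [dd for _, dd in docs]
        yield times[0], (values if names is None else dict(zip(names, values)))

# Example:
#   list(aligned_merge_sketch([[(1, 'a'), (2, 'b')], [(1, 'x'), (2, 'y')]],
#                             names=['s1', 's2']))
#   -> [(1, {'s1': 'a', 's2': 'x'}), (2, {'s1': 'b', 's2': 'y'})]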
def _execute(self, source, interval):
    source_last_doc = source.window(interval, force_calculation=True).last()
    if not source_last_doc:
        return
    timestamp, data = source_last_doc
    if self.element is None:
        data_element = data
    elif self.element in data:
        data_element = data[self.element]
    else:
        return
    try:
        # First try to treat data_element as a dict
        for key, value in data_element.items():
            if self.use_value_instead_of_key:
                yield StreamMetaInstance(
                    StreamInstance(timestamp=timestamp, value=value), value)
            else:
                yield StreamMetaInstance(
                    StreamInstance(timestamp=timestamp, value=value), key)
    except AttributeError:
        # Otherwise assume that data_element can be used as a list
        for value in data_element:
            yield StreamMetaInstance(
                StreamInstance(timestamp=timestamp, value=value), value)
def _execute(self, sources, alignment_stream, interval):
    for time, data in sources[0].window(interval, force_calculation=True):
        if self.complement:
            # Guard against missing keys, mirroring the non-complement branch
            if self.key in data and data[self.key] not in self.values:
                yield StreamInstance(time, data)
        else:
            if self.key in data and data[self.key] in self.values:
                yield StreamInstance(time, data)
def _execute(self, sources, alignment_stream, interval):
    for time, data in sources[0].window(interval, force_calculation=True):
        dict_mean = dict()
        if len(data) == 0:
            # Nothing to average; skip to avoid the division by zero below
            yield StreamInstance(time, dict_mean)
            continue
        inv_len_data = 1 / float(len(data))
        for item in data:
            for key in item.keys():
                try:
                    dict_mean[key] = dict_mean[key] + item[key] * inv_len_data
                except KeyError:
                    dict_mean[key] = item[key] * inv_len_data
        yield StreamInstance(time, dict_mean)
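
# A self-contained sketch of the element-wise dict mean computed above (the
# function name and data are hypothetical). Keys missing from some dicts are
# still divided by the total count, matching the tool's accumulation.
def dict_mean_sketch(dicts):
    if not dicts:
        return {}
    weight = 1.0 / len(dicts)
    mean = {}
    for d in dicts:
        for key, value in d.items():
            mean[key] = mean.get(key, 0.0) + value * weight
    return mean

# dict_mean_sketch([{'a': 1, 'b': 2}, {'a': 3}])  ->  {'a': 2.0, 'b': 1.0}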
def _execute(self, sources, alignment_stream, interval):
    time_interval = TimeInterval(MIN_DATE, interval.end)
    param_doc = sources[0].window(time_interval, force_calculation=True).last()
    if param_doc is None:
        logging.debug("No model found in {} for time interval {}".format(
            sources[0].stream_id, time_interval))
        return
    steps = deserialise_json_pipeline(
        {
            'vectorisation': DictVectorizer(sparse=False),
            'fill_missing': FillZeros(),
            'classifier': LinearDiscriminantAnalysis(),
            'label_encoder': LabelEncoder()
        },
        param_doc.value)
    clf = Pipeline([(kk, steps[kk])
                    for kk in ('vectorisation', 'fill_missing', 'classifier')])
    locations = steps['label_encoder'].classes_
    data = sources[1].window(interval, force_calculation=True)
    for tt, dd in data:
        yield StreamInstance(
            tt,
            {locations[ii]: pp
             for ii, pp in enumerate(clf.predict_proba(dd)[0])})
def _execute(self, sources, alignment_stream, interval):
    for tt, rows in sources[0].window(interval, force_calculation=True):
        vals = defaultdict(list)
        for row in rows:
            try:
                # Treat the row value as a dict of numeric fields; .items()
                # raises AttributeError when it is not a dict
                for kk, vv in row.value.items():
                    if isinstance(vv, (int, float)):
                        vals[kk].append(vv)
            except AttributeError:
                # This is not a dict, try to apply directly here
                if isinstance(row, (int, float)):
                    vals[None].append(row)
        if len(vals) == 1 and None in vals:
            result = self.func(iter(vals[None]))
        else:
            result = {}
            for kk, vv in vals.items():
                x = self.func(vv)
                if x is not None:
                    result[kk] = x
        if result is not None:
            yield StreamInstance(tt, result)
def _execute(self, source, interval):
    source_last_doc = source.window(interval, force_calculation=True).last()
    if not source_last_doc:
        # No documents in the window; nothing to split
        return
    timestamp, data = source_last_doc
    if self.element in data:
        for key, value in data[self.element].items():
            yield StreamMetaInstance(
                StreamInstance(timestamp=timestamp, value=value), key)
def _execute(self, sources, alignment_stream, interval):
    for time, data in sources[0].window(interval, force_calculation=True):
        try:
            yield StreamInstance(time, data[self.index])
        except (IndexError, KeyError, TypeError):
            # Only swallow indexing failures rather than using a bare except
            logging.debug('The array could not be sliced with {}'.format(
                self.index))
def _execute(self, sources, alignment_stream, interval):
    max_interval = TimeInterval(MIN_DATE, interval.end)
    exp_list = {}
    for timestamp, value in sources[0].window(max_interval,
                                              force_calculation=True):
        if value['tier'] != "Experiment":
            continue
        d = deepcopy(value)
        mongo_id = d.pop('_id')
        trigger = d.pop('trigger')
        if trigger == 1:
            u = {'start': timestamp}
        else:
            u = {'end': timestamp}
        if mongo_id in exp_list:
            # u has exactly one key; u.keys()[0] is not valid in Python 3
            if next(iter(u)) in exp_list[mongo_id]:
                raise ValueError(
                    "Duplicate {} triggers found for timestamp {}".format(
                        trigger, timestamp))
            exp_list[mongo_id].update(u)
        else:
            d.update(u)
            exp_list[mongo_id] = d
    for doc in exp_list.values():
        if TimeInterval(doc['start'], doc['end']) in max_interval:
            yield StreamInstance(doc['end'], doc)
def test_index_of_by_stream(self):
    w = basic_workflow(sys._getframe().f_code.co_name)

    aggregate_loc = channels.get_tool(
        name="index_of_by_stream",
        parameters=dict(index="kitchen")
    )

    # Create a stream with the single value "location" in it
    w.create_node(stream_name="selector_meta_data", channel=A, plate_ids=None)
    A.write_to_stream(
        stream_id=StreamId(name="selector_meta_data"),
        data=StreamInstance(timestamp=utcnow(), value="location"))

    N = w.nodes
    w.create_factor(
        tool=aggregate_loc,
        sources=[N["selector_meta_data"], N["rss"]],
        sink=N["rss_kitchen"]
    )

    time_interval = TimeInterval(
        scripted_experiments[0].start,
        scripted_experiments[0].start + 2 * minute)

    w.execute(time_interval)

    key = h1 + (('location', 'kitchen'),) + wA
    assert all(a == b for a, b in zip(
        N['rss_kitchen'].streams[h1 + wA].window(time_interval).head(10),
        N['rss'].streams[key].window(time_interval).head(10)))
def _execute(self, sources, alignment_stream, interval):
    # Put all of the data in a dict of sorted lists (inefficient!)
    data = dict(
        (source.stream_id,
         sorted(source.window(interval, force_calculation=True),
                key=lambda x: x.timestamp))
        for source in sources)

    # Create a set of all of the timestamps available (also inefficient!)
    timestamps = sorted(set(
        item.timestamp for d in data.values() for item in d))

    # Maintain a dict of indices where the timestamps appear
    last_timestamps = dict((stream_id, MIN_DATE) for stream_id in data)

    # Now loop through the timestamps, and aggregate over the aggregation plate
    for ts in timestamps:
        values = []
        for stream_id in data:
            for item in data[stream_id]:
                if item.timestamp < last_timestamps[stream_id]:
                    continue
                if item.timestamp < ts:
                    continue
                if item.timestamp == ts:
                    values.append(item.value)
                    last_timestamps[stream_id] = item.timestamp
                    break
        yield StreamInstance(ts, self.func(values))
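
# A rough standalone equivalent of the aggregation above (identifiers and data
# are hypothetical): for every timestamp seen in any stream, apply func to the
# values of exactly those streams that contain that timestamp.
def aggregate_at_timestamps_sketch(data, func):
    # data: dict mapping stream_id -> sorted list of (timestamp, value) pairs
    timestamps = sorted(set(ts for series in data.values() for ts, _ in series))
    for ts in timestamps:
        values = [vv for series in data.values()
                  for tt, vv in series if tt == ts]
        yield ts, func(values)

# list(aggregate_at_timestamps_sketch({'s1': [(1, 2), (2, 4)], 's2': [(2, 6)]}, sum))
# -> [(1, 2), (2, 10)]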
def _execute(self, sources, alignment_stream, interval):
    for tt, rows in sources[0].window(interval, force_calculation=True):
        values = defaultdict(list)
        for row in rows:
            # .iteritems() is Python 2 only; use .items() instead
            for kk, vv in row.items():
                if isinstance(vv, (int, float)):
                    values[kk].append(vv)
                else:
                    # TODO: Need to store other things like the uid for output
                    pass
        for kk, vv in values.items():
            try:
                result = self.func(kk, vv)
                if result is not None:
                    yield StreamInstance(tt, {kk: result})
            except KeyError:
                pass
def _execute(self, sources, alignment_stream, interval):
    for time, data in sources[0].window(interval, force_calculation=True):
        yield StreamInstance(
            time, {k: v for k, v in data.items() if k in self.keys})
def _execute(self, sources, alignment_stream, interval):
    for time, data in sources[0].window(interval, force_calculation=True):
        t6 = time - timedelta(seconds=6)
        t2 = time - timedelta(seconds=2)
        tap_list = []
        ok = True
        for i in range(len(data)):
            t, magnitude = data[i]
            if magnitude < 1.8:
                continue
            if (t < t6) or (t > t2):
                ok = False
                break  # too high magnitude outside of the 4 sec window
            # Use len(data) - 1 so the data[i + 1] lookup below cannot
            # overrun the window
            if (i == 0) or (i == len(data) - 1):
                ok = False
                break  # no taps counted at the ends of the window
            if (magnitude > data[i - 1][1]) and (magnitude > data[i + 1][1]):
                tap_list.append(data[i])
        if ok and (len(tap_list) >= 3):
            wearable = [w for (s, w) in sources[0].stream_id.meta_data
                        if s == 'wearable'][0]
            res = '\n'.join(
                '{0} {1:.2} {2:%Y-%m-%d %H:%M:%S.%f}'.format(
                    wearable, tap.value, tap.timestamp)
                for tap in tap_list) + '\n'
            print(res)
            yield StreamInstance(time, dict(tap_list=tap_list,
                                            all_10_sec=data))
def _execute(self, sources, interval):
    if self.categorical:
        for t, d in sources[0].window(interval, force_calculation=True):
            yield StreamInstance(t, dict(map(safe_key, Counter(d).items())))
    else:
        if self.breaks is not None:
            breaks = self.breaks
        else:
            breaks = [
                self.first_break + i * self.break_width
                for i in range(self.n_breaks)
            ]
        breaks = [-float('inf')] + breaks + [float('inf')]
        for t, d in sources[0].window(interval, force_calculation=True):
            yield StreamInstance(t, np.histogram(d, breaks)[0].tolist())
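
# A hedged illustration of the non-categorical branch above: the breaks are
# padded with -inf/+inf so out-of-range values land in the outer bins. The
# numbers here are made up.
import numpy as np

breaks = [-float('inf')] + [i * 10 for i in range(4)] + [float('inf')]
# breaks == [-inf, 0, 10, 20, 30, inf]
counts = np.histogram([-5, 3, 12, 15, 99], breaks)[0].tolist()
# counts == [1, 1, 2, 0, 1]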
def _execute(self, sources, alignment_stream, interval):
    if self.percentiles is not None:
        percentiles = self.percentiles
    else:
        percentiles = [
            i * 100.0 / self.n_segments for i in range(self.n_segments + 1)
        ]
    for t, d in sources[0].window(interval, force_calculation=True):
        yield StreamInstance(t, np.percentile(d, percentiles).tolist())
def _execute(self, sources, alignment_stream, interval):
    s1 = next(sources[1])
    # TODO: should the loop below be: for (t, data1) in sources[0].execute(interval)?
    for (t, data1) in sources[0]:
        (_, data2) = next(s1)
        # TODO: type checking key/value pairs?
        yield StreamInstance(t, data1 * data2)
def reformat(doc):
    dt = doc.pop('datetime')
    # Default to house '1' when no house_id is present
    house_id = doc.pop('house_id', '1')
    return StreamMetaInstance(
        stream_instance=StreamInstance(dt, doc),
        meta_data=('house', house_id))
def _execute(self, sources, alignment_stream, interval):
    for time, data in sources[0].window(interval, force_calculation=True):
        # Copy the start dict so the accumulation below does not mutate
        # self.start_dict across documents
        dict_sum = dict(self.start_dict) if self.start_dict is not None else dict()
        if len(data) == 0:
            yield StreamInstance(time, dict_sum)
            continue  # avoid yielding twice for an empty document
        for item in data:
            for key in item.keys():
                try:
                    dict_sum[key] = dict_sum[key] + item[key]
                except KeyError:
                    if self.log_new_keys:
                        logging.warning(
                            'Novel key in ListDictSum: {}'.format(key))
                    if self.insert_new_keys:
                        dict_sum[key] = item[key]
        yield StreamInstance(time, dict_sum)
def _execute(self, sources, alignment_stream, interval):
    sliding_window = sources[0].window(interval, force_calculation=True)
    data = iter(sources[1].window(interval, force_calculation=True))

    window = []
    future = []

    for time, rel_window in sliding_window:
        lower = rel_window.start
        upper = rel_window.end

        # Prune the old data points from the window
        num_to_remove = 0
        for win_time, win_data in window:
            if lower < win_time <= upper:  # MK: changed from lower <= win_time <= upper
                break
            num_to_remove += 1
        window = window[num_to_remove:]

        # Add those stolen from the future
        num_to_remove = 0
        for doc in future:
            fut_time, fut_data = doc
            # MK: "if lower <= fut_time <= upper: break" was a bug, because
            # documents in the far future were thrown away from the future list
            if fut_time > upper:
                # Added by MK: if in the far future, then must remain in future
                break
            num_to_remove += 1
            if fut_time >= lower:
                window.append(doc)
        future = future[num_to_remove:]

        # Take data from the data stream
        while True:
            try:
                doc = next(data)
                tt, dd = doc
                if lower < tt <= upper:  # MK: changed from lower <= tt <= upper
                    window.append(doc)
                elif tt > upper:
                    future.append(doc)
                    break
            except StopIteration:
                break

        value = [stream_instance.value for stream_instance in window]
        if len(value) > 0:
            yield StreamInstance(time, value)
        else:
            # TODO: Should we yield anything here?
            # yield StreamInstance(time, {})
            pass
def _execute(self, sources, alignment_stream, interval):
    s0 = sources[0].window(interval, force_calculation=True)
    s1 = sources[1].window(interval, force_calculation=True)
    for (d0, d1) in zip(s0, s1):
        if d0.timestamp != d1.timestamp:
            raise ValueError(
                "{} tool expects aligned timestamps".format(self.name))
        yield StreamInstance(d0.timestamp, d0.value * d1.value)
def _execute(self, sources, alignment_stream, interval):
    for time, data in sources[0].window(interval, force_calculation=True):
        max_value = None
        argmax = None
        for key, value in data.items():
            if max_value is None or value > max_value:
                max_value = value
                argmax = key
        yield StreamInstance(time, argmax)
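
# An equivalent one-liner for the argmax loop above, for reference; max()
# raises on an empty dict where the loop yields None, hence the guard. The
# function name and example data are hypothetical.
def dict_argmax_sketch(data):
    return max(data, key=data.get) if data else None

# dict_argmax_sketch({'kitchen': 0.7, 'lounge': 0.2})  ->  'kitchen'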
def reformat(doc):
    doc = deepcopy(doc)
    dt = doc.pop('datetime')
    if 'hid' in doc and doc['hid'] is not None:
        house_id = doc.pop('hid')
    else:
        house_id = '1'
    return StreamMetaInstance(
        stream_instance=StreamInstance(dt, doc),
        meta_data=('house', house_id))
def _execute(self, sources, alignment_stream, interval):
    if interval.start < self.first:
        interval.start = self.first
    n_strides = int((interval.start - self.first).total_seconds()
                    // self._stride.total_seconds())
    t = self.first + n_strides * self._stride
    while t <= interval.end:
        if t > interval.start:
            yield StreamInstance(t, t)
        t += self._stride
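
# A worked example of the stride arithmetic above with concrete (hypothetical)
# datetimes: the tool snaps to the last tick at or before interval.start, then
# emits only ticks strictly inside the interval.
from datetime import datetime, timedelta

first = datetime(2024, 1, 1)
stride = timedelta(minutes=10)
start = datetime(2024, 1, 1, 0, 25)
n_strides = int((start - first).total_seconds() // stride.total_seconds())
t = first + n_strides * stride
# n_strides == 2, t == 00:20, which is <= start, so the first tick actually
# emitted by the loop is 00:30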
def _execute(self, sources, alignment_stream, interval):
    sliding_window = sources[0].window(interval, force_calculation=True)
    data = iter(sources[1].window(interval, force_calculation=True))

    window = []
    future = []

    for time, rel_window in sliding_window:
        lower = rel_window.start
        upper = rel_window.end

        # Prune the old data points from the window
        num_to_remove = 0
        for win_time, win_data in window:
            if lower <= win_time <= upper:
                break
            num_to_remove += 1
        window = window[num_to_remove:]

        # Add those stolen from the future
        num_to_remove = 0
        for doc in future:
            fut_time, fut_data = doc
            if lower <= fut_time <= upper:
                break
            num_to_remove += 1
            window.append(doc)
        future = future[num_to_remove:]

        # Take data from the data stream
        while True:
            try:
                doc = next(data)
                tt, dd = doc
                if lower <= tt <= upper:
                    window.append(doc)
                elif tt > upper:
                    future.append(doc)
                    break
            except StopIteration:
                break

        yield StreamInstance(time, self.func(iter(window)))
def _execute(self, sources, alignment_stream, interval):
    for i, row in self.data.iterrows():
        dt = unix2datetime(row["dt"])
        if dt in interval:
            yield StreamInstance(
                dt,
                dict(camera_id=row["camera_id"],
                     exper_id=row["exper_id"],
                     person_id=row["person_id"],
                     wearable_id=row["wearable_id"]))
def _execute(self, sources, alignment_stream, interval):
    data = list(sources[0].window(interval, force_calculation=True))
    # Build a list (rather than a lazy map object) so pandas sees all rows
    # under Python 3
    flattened = [
        dict(dict(timestamp=x.timestamp,
                  fold=x.value['localisation-experiment'],
                  location=next(iter(x.value['annotations']['Location']),
                                None)),
             **(x.value['rssi']))
        for x in data
    ]
    df = pd.DataFrame(flattened)
    yield StreamInstance(interval.end, df)
def _execute(self, sources, alignment_stream, interval):
    data = sources[0].window(interval, force_calculation=True)
    mappings = []
    for x in data:
        experiment_interval = TimeInterval(x.value['start'], x.value['end'])
        experiment_id = construct_experiment_id(experiment_interval)
        if experiment_id in self.experiment_ids:
            mappings.append((experiment_id, experiment_interval))
    yield StreamInstance(interval.end, mappings)
def _execute(self, sources, alignment_stream, interval):
    source = sources[0]
    data = list(source.window(interval, force_calculation=True))

    classifier_name = dict(source.stream_id.meta_data)['localisation_model']
    if classifier_name == "lda":
        classifier = LinearDiscriminantAnalysis()
    elif classifier_name == "svm":
        classifier = OneVsRestClassifier(LinearSVC())
    else:
        raise NotImplementedError(
            "Unknown classifier type {}".format(classifier_name))

    if not data:
        return

    yy_key = 'annotations'
    xx_key = 'rssi'
    ex_key = 'localisation-experiment'

    # TODO: change data to go from ['anno']['Location'] to just ['anno']
    keep_inds = []
    for di, (tt, dd) in enumerate(data):
        exp = dd[ex_key]
        loc = list(dd[yy_key]['Location'])
        if len(loc) == 1 and loc[0] != 'MIX' and exp != 'MIX':
            keep_inds.append(di)

    folds = [data[ii].value[ex_key] for ii in keep_inds]
    train_x = [data[ii].value[xx_key] for ii in keep_inds]
    train_y = [list(data[ii].value[yy_key]['Location'])[0]
               for ii in keep_inds]
    # TODO: update ['anno']['Location'] keys format changed

    label_encoder = LabelEncoder()
    train_y_trans = label_encoder.fit_transform(train_y)

    param_dict = {
        'vectorisation': DictVectorizer(sparse=False),
        'fill_missing': FillZeros(self.nan_value),
        'classifier': classifier,
    }

    clf = Pipeline([(kk, param_dict[kk])
                    for kk in ('vectorisation', 'fill_missing', 'classifier')])
    clf.fit(train_x, train_y_trans)

    clf_serialised = serialise_pipeline(clf)
    clf_serialised['label_encoder'] = serialise_dict(label_encoder.__dict__)
    clf_serialised['performance'] = predefined_train_test_split(
        train_x, train_y_trans, folds, clf, label_encoder)

    experiment_ids = sources[0].stream_id.name.split('_')[-2:]
    clf_serialised['experiment_ids_str'] = '_'.join(experiment_ids)
    # Materialise the map so the serialised value is a list under Python 3
    clf_serialised['experiment_interval'] = list(
        map(reconstruct_interval, experiment_ids))
    clf_serialised['tool_parameters'] = dict(
        (x, self.__dict__[x]) for x in self.__dict__ if not x.startswith("_"))

    yield StreamInstance(interval.end, clf_serialised)
def _execute(self, sources, alignment_stream, interval):
    data = list(sources[0].window(interval, force_calculation=True))
    # Build a list (rather than a lazy map object) so pandas sees all rows
    # under Python 3
    flattened = [
        dict(dict(experiment_id=construct_experiment_id(
                      TimeInterval(x.value['start'], x.value['end'])),
                  start=x.value['start'],
                  end=x.value['end'],
                  annotator=x.value['annotator']),
             **(x.value['notes']))
        for x in data
    ]
    df = pd.DataFrame(flattened)
    df['id'] = range(1, len(df) + 1)
    yield StreamInstance(interval.end, df)