def test_event_descriptor_insertion():
    # format some data keys for insertion
    data_keys = {'some_value': {'source': 'PV:pv1',
                                'shape': [1, 2],
                                'dtype': 'array'},
                 'some_other_val': {'source': 'PV:pv2',
                                    'shape': [],
                                    'dtype': 'number'},
                 'data_key3': {'source': 'PV:pv1',
                               'shape': [],
                               'dtype': 'number',
                               'external': 'FS:foobar'}}
    time = ttime.time()
    # test insert
    ev_desc_uid = mdsc.insert_descriptor(run_start_uid, data_keys, time,
                                         str(uuid.uuid4()))
    ev_desc_mds, = mdsc.find_descriptors(uid=ev_desc_uid)

    # make sure the sanitized event descriptor has no uid
    check_for_id(ev_desc_mds)
    # make sure the event descriptor is pointing to the correct run start
    referenced_run_start = ev_desc_mds['run_start']
    assert referenced_run_start.uid == run_start_uid
    assert ev_desc_mds['time'] == time

    for k in data_keys:
        for ik in data_keys[k]:
            assert ev_desc_mds.data_keys[k][ik] == data_keys[k][ik]
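# The test above calls a `check_for_id` helper that is not shown in this
# section. A minimal sketch of what it might look like, assuming the
# sanitized document is dict-like and must not expose a MongoDB '_id' field:
def check_for_id(document):
    # '_id' should have been stripped during sanitization
    assert '_id' not in document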
def get_events(headers, fields=None, fill=True):
    """
    Get Events from given run(s).

    Parameters
    ----------
    headers : Header or iterable of Headers
        The headers to fetch the events for
    fields : list, optional
        whitelist of field names of interest; if None, all are returned
    fill : bool, optional
        Whether externally-stored data should be filled in. Defaults to True

    Yields
    ------
    event : Event
        The event, optionally with non-scalar data filled in
    """
    # A word about the 'fields' argument:
    # Notice that we assume that the same field name cannot occur in
    # more than one descriptor. We could relax this assumption, but
    # we currently enforce it in bluesky, so it is safe for now.

    # Normalize a single Header into a one-element list.
    try:
        headers.items()
    except AttributeError:
        pass
    else:
        headers = [headers]

    if fields is None:
        fields = []
    fields = set(fields)

    for header in headers:
        descriptors = find_descriptors(header['start']['uid'])
        for descriptor in descriptors:
            all_fields = set(descriptor['data_keys'])
            if fields:
                discard_fields = all_fields - fields
            else:
                discard_fields = []
            if discard_fields == all_fields:
                continue
            for event in get_events_generator(descriptor):
                for field in discard_fields:
                    del event.data[field]
                    del event.timestamps[field]
                if fill:
                    fill_event(event)
                yield event
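# Usage sketch (illustrative, not from the original source): stream events
# from runs found via a DataBroker query, keeping only the assumed field
# name 'motor1'.
def _example_get_events_usage():
    headers = DataBroker(start_time='2015-03-05', stop_time='2015-03-10')
    for event in get_events(headers, fields=['motor1']):
        print(event.seq_num, event.data['motor1'])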
def test_find_events_smoke():
    num = 50
    rs, e_desc, data_keys = setup_syn()
    all_data = syn_data(data_keys, num)

    mdsc.bulk_insert_events(e_desc, all_data, validate=False)
    mdsc.insert_run_stop(rs, ttime.time(), uid=str(uuid.uuid4()))

    mdsc.clear_process_cache()

    # make sure the uid works
    next(mdsc.find_events(descriptor=e_desc))

    mdsc.clear_process_cache()
    descriptor, = mdsc.find_descriptors(uid=e_desc)
    mdsc.clear_process_cache()

    # make sure that searching by descriptor document works
    next(mdsc.find_events(descriptor=descriptor))
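# `setup_syn` and `syn_data` are test helpers defined elsewhere in the
# suite. A hypothetical reconstruction of `syn_data`, assuming
# bulk_insert_events accepts an iterable of event dicts carrying 'data',
# 'timestamps', 'seq_num', 'time', and 'uid' keys:
def syn_data(data_keys, num):
    all_data = []
    for seq_num in range(num):
        now = ttime.time()
        all_data.append({'data': {k: float(seq_num) for k in data_keys},
                         'timestamps': {k: now for k in data_keys},
                         'seq_num': seq_num,
                         'time': now,
                         'uid': str(uuid.uuid4())})
    return all_data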
def get_table(headers, fields=None, fill=True, convert_times=True):
    """
    Make a table (pandas.DataFrame) from given run(s).

    Parameters
    ----------
    headers : Header or iterable of Headers
        The headers to fetch the events for
    fields : list, optional
        whitelist of field names of interest; if None, all are returned
    fill : bool, optional
        Whether externally-stored data should be filled in. Defaults to True
    convert_times : bool, optional
        Whether to convert times from float (seconds since 1970) to
        numpy datetime64, using pandas. True by default.

    Returns
    -------
    table : pandas.DataFrame
    """
    # A word about the 'fields' argument:
    # Notice that we assume that the same field name cannot occur in
    # more than one descriptor. We could relax this assumption, but
    # we currently enforce it in bluesky, so it is safe for now.

    # Normalize a single Header into a one-element list.
    try:
        headers.items()
    except AttributeError:
        pass
    else:
        headers = [headers]

    if fields is None:
        fields = []
    fields = set(fields)

    dfs = []
    for header in headers:
        descriptors = find_descriptors(header['start']['uid'])
        for descriptor in descriptors:
            all_fields = set(descriptor['data_keys'])
            if fields:
                discard_fields = all_fields - fields
            else:
                discard_fields = []
            if discard_fields == all_fields:
                continue
            is_external = _inspect_descriptor(descriptor)

            payload = get_events_table(descriptor)
            descriptor, data, seq_nums, times, uids, timestamps = payload
            df = pd.DataFrame(index=seq_nums)
            if convert_times:
                times = pd.to_datetime(
                    pd.Series(times), unit='s', utc=True).dt.tz_localize(TZ)
            df['time'] = times
            for field, values in six.iteritems(data):
                if field in discard_fields:
                    logger.debug('Discarding field %s', field)
                    continue
                if is_external[field] and fill:
                    logger.debug('filling data for %s', field)
                    # TODO someday we will have bulk retrieve in FS
                    values = [fs.retrieve(value) for value in values]
                df[field] = values
            dfs.append(df)
    if dfs:
        return pd.concat(dfs)
    else:
        # edge case: no data
        return pd.DataFrame()
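# Usage sketch (illustrative): build a table for one assumed field across
# the runs matched by a DataBroker query; with convert_times=True the
# 'time' column holds tz-aware datetimes instead of raw float seconds.
def _example_get_table_usage():
    headers = DataBroker(data_key='motor1')
    table = get_table(headers, fields=['motor1'], convert_times=True)
    print(table[['time', 'motor1']].head())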
def __call__(self, **kwargs):
    """Given search criteria, find Headers describing runs.

    This function returns a list of dictionary-like objects encapsulating
    the metadata for a run -- start time, instruments used, and so on.
    In addition to the Parameters below, advanced users can specify
    arbitrary queries that are passed through to mongodb.

    Parameters
    ----------
    start_time : time-like, optional
        Include Headers for runs started after this time. Valid
        "time-like" representations are:
            - float timestamps (seconds since 1970), such as time.time()
            - '2015'
            - '2015-01'
            - '2015-01-30'
            - '2015-03-30 03:00:00'
            - Python datetime objects, such as datetime.datetime.now()
    stop_time : time-like, optional
        Include Headers for runs started before this time. See
        `start_time` above for examples.
    beamline_id : str, optional
        String identifier for a specific beamline
    project : str, optional
        Project name
    owner : str, optional
        The username of the logged-in user when the scan was performed
    scan_id : int, optional
        Integer scan identifier
    uid : str, optional
        Globally unique id string provided to metadatastore
    data_key : str, optional
        The alias (e.g., 'motor1') or PV identifier of data source

    Returns
    -------
    data : list
        Header objects

    Examples
    --------
    >>> DataBroker(start_time='2015-03-05', stop_time='2015-03-10')
    >>> DataBroker(data_key='motor1')
    >>> DataBroker(data_key='motor1', start_time='2015-03-05')
    """
    data_key = kwargs.pop('data_key', None)
    run_start = find_run_starts(**kwargs)
    if data_key is not None:
        node_name = 'data_keys.{0}'.format(data_key)

        query = {node_name: {'$exists': True}}
        descriptors = []
        for rs in run_start:
            descriptor = find_descriptors(run_start=rs, **query)
            for d in descriptor:
                descriptors.append(d)
        # query = {node_name: {'$exists': True},
        #          'run_start_id': {'$in': [ObjectId(rs.id)
        #                                   for rs in run_start]}}
        # descriptors = find_descriptors(**query)

        # Keep only one RunStart per uid, preserving order of discovery.
        result = []
        known_uids = deque()
        for descriptor in descriptors:
            if descriptor['run_start']['uid'] not in known_uids:
                rs = descriptor['run_start']
                known_uids.append(rs['uid'])
                result.append(rs)
        run_start = result
    result = []
    for rs in run_start:
        result.append(Header.from_run_start(rs))
    return result
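# Because unrecognized keyword arguments fall through to the underlying
# mongo query, advanced searches can use raw query operators. The custom
# metadata key below is an illustrative assumption:
# >>> DataBroker(owner='xf23id1', **{'sample.temperature': {'$gt': 300}})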