def process_file(self, filename, flush_after=True):
    """Process the file specified by ``filename`` using a
    G3IndexedReader.  Each frame from the file is passed to
    self.Process, with the optional index_info argument set to a
    dictionary containing the filename and byte_offset of the frame.

    Internal data grouping will be somewhat cleaner if the multiple
    files from a single aggregator "session" are passed to this
    function in acquisition order.  In that case, call with
    flush_after=False.
    """
    reader = so3g.G3IndexedReader(filename)
    while True:
        info = {'filename': filename,
                'byte_offset': reader.Tell()}
        frames = reader.Process(None)
        assert len(frames) <= 1
        if len(frames) == 0:
            break
        self(frames[0], info)
    # Calling flush() here protects us against the odd case that we
    # process files from a single session in non-consecutive order.
    # In that case the 'start' and 'end' times will get messed up
    # because we can't tell the stream has been re-initialized.
    if flush_after:
        self.flush()
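# Usage sketch (illustrative, not from the source): assuming `scanner` is an
# instance of the class that provides process_file() and flush(), and
# `session_files` is a hypothetical list of G3 file paths from one aggregator
# session, in acquisition order:
#
#     for g3file in session_files:
#         scanner.process_file(g3file, flush_after=False)
#     scanner.flush()  # flush once after the last file of the session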
def load_status(self, time, show_pb=False):
    """
    Returns the rogue status at the specified unix timestamp.  Loads all
    status frames between the session start frame and the specified time.

    Args:
        time (timestamp): Time at which you want the rogue status

    Returns:
        status (SmurfStatus): SmurfStatus object wrapping the rogue
            variables at the specified time.
    """
    session = self.Session()
    session_start, = session.query(Frame.time).filter(
        Frame.type_name == 'Observation',
        Frame.time <= dt.datetime.fromtimestamp(time)
    ).order_by(Frame.time.desc()).first()

    status_frames = session.query(Frame).filter(
        Frame.type_name == 'Wiring',
        Frame.time >= session_start,
        Frame.time <= dt.datetime.fromtimestamp(time)
    ).order_by(Frame.time)

    status = {}
    cur_file = None
    for frame_info in tqdm(status_frames.all(), disable=(not show_pb)):
        file = frame_info.file.path
        if file != cur_file:
            reader = so3g.G3IndexedReader(file)
            cur_file = file
        reader.Seek(frame_info.offset)
        frame = reader.Process(None)[0]
        status.update(yaml.safe_load(frame['status']))

    return SmurfStatus(status)
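# Usage sketch (illustrative): assuming `archive` is an instance of the class
# that owns load_status() and its frame index database, the status as of one
# hour ago could be queried like this:
#
#     import time
#     status = archive.load_status(time.time() - 3600, show_pb=True)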
def add_file(self, path, session):
    """
    Indexes a single file and adds it to the sqlite database.

    Args
    ----
    path (path): Path of the file to index
    session (sqlalchemy session): Database session used to add the file
        and frame entries.
    """
    frame_types = {
        ft.type_name: ft for ft in session.query(FrameType).all()
    }

    db_file = Files(path=path)
    session.add(db_file)

    reader = so3g.G3IndexedReader(path)

    total_channels = 0
    file_start, file_stop = None, None
    frame_idx = 0
    while True:
        db_frame = Frame(frame_idx=frame_idx, file=db_file)
        db_frame.offset = reader.Tell()

        frames = reader.Process(None)
        if not frames:
            break
        frame = frames[0]
        frame_idx += 1

        if str(frame.type) not in type_key:
            continue

        db_frame.frame_type = frame_types[str(frame.type)]

        timestamp = frame['time'].time / core.G3Units.s
        db_frame.time = dt.datetime.fromtimestamp(timestamp)

        data = frame.get('data')
        if data is not None:
            db_frame.samples = data.n_samples
            db_frame.channels = len(data)
            db_frame.start = dt.datetime.fromtimestamp(
                data.start.time / core.G3Units.s)
            db_frame.stop = dt.datetime.fromtimestamp(
                data.stop.time / core.G3Units.s)

            if file_start is None:
                file_start = db_frame.start
            file_stop = db_frame.stop
            total_channels = max(total_channels, db_frame.channels)

        session.add(db_frame)

    db_file.start = file_start
    db_file.stop = file_stop
    db_file.channels = total_channels
    db_file.frames = frame_idx
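# Usage sketch (illustrative): assuming `archive` is an instance of the
# indexing class and exposes a SQLAlchemy session factory via Session()
# (as used elsewhere in this module); the file path is hypothetical:
#
#     session = archive.Session()
#     archive.add_file('/data/timestreams/example_000.g3', session)
#     session.commit()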
def unpack_frames(filename, field_request, streams):
    """Read frames from the specified file and expand the data by
    stream.  Only the requested fields, specified through
    ``field_request``, are expanded.

    Arguments:
      filename (str): Full path to the file to load.
      field_request: Instructions for what fields to load.
      streams: Structure to which to append the streams from this
        file (perhaps obtained from running unpack_frames on a
        preceding file).

    Returns:
      streams (structure containing lists of numpy arrays).
    """
    if streams is None:
        streams = field_request.empty()

    reader = so3g.G3IndexedReader(filename)
    while True:
        frames = reader.Process(None)
        if len(frames) == 0:
            break
        frame = frames[0]
        if frame.type == g3core.G3FrameType.Scan:
            unpack_frame_object(frame, field_request, streams)
    return streams
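# Usage sketch (illustrative): streams accumulate across calls, so a list of
# consecutive files (here the hypothetical `file_list` and `request`) can be
# unpacked into a single structure:
#
#     streams = None
#     for f in file_list:
#         streams = unpack_frames(f, request, streams)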
def unpack_frames(filename, field_request, streams, samples=None):
    """Read frames from the specified file and expand the data by
    stream.  Only the requested fields, specified through
    ``field_request``, are expanded.

    Arguments:
      filename (str): Full path to the file to load.
      field_request: Instructions for what fields to load.
      streams: Structure to which to append the streams from this
        file (perhaps obtained from running unpack_frames on a
        preceding file).
      samples (int, int): Start and end of sample range to unpack
        *from this file*.  First argument must be non-negative.
        Second argument may be None, indicating to read forever.

    Returns:
      streams (structure containing lists of numpy arrays).
    """
    if streams is None:
        streams = field_request.empty()

    if samples is None:
        offset = 0
        to_read = None
    else:
        offset, to_read = samples
        if to_read is not None:
            to_read -= offset

    reader = so3g.G3IndexedReader(filename)
    while to_read is None or to_read > 0:
        frames = reader.Process(None)
        if len(frames) == 0:
            break
        frame = frames[0]
        if frame.type != g3core.G3FrameType.Scan:
            continue
        _consumed = unpack_frame_object(frame, field_request, streams,
                                        offset=offset, max_count=to_read)
        offset -= _consumed
        if offset < 0:
            if to_read is not None:
                to_read += offset
            offset = 0
    return streams
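# Usage sketch (illustrative): the `samples` range is interpreted per file.
# For example, samples=(1000, 5000) skips the first 1000 Scan samples in this
# file and then unpacks at most 4000 samples; samples=(1000, None) reads from
# sample 1000 through the end of the file.
#
#     streams = unpack_frames(filename, request, None, samples=(1000, 5000))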
def test_50_compression(self):
    test_file = 'test_g3super.g3'

    # Entropy?
    sigma_bits = 8
    sigma = 2**sigma_bits
    _get_ts = lambda dtype: self._get_ts(
        100, 10000, sigma=sigma, dtype=dtype, seed=12345)

    w = core.G3Writer(test_file)
    sizes = {d: [] for d in ALL_DTYPES}
    for dtype in ALL_DTYPES:
        # No compression
        f = core.G3Frame()
        ts = _get_ts(dtype)
        sizes[dtype].append(ts.data.nbytes)
        ts.options(enable=0)
        f['ts_%s' % dtype] = ts
        w.Process(f)

        # Yes compression
        f = core.G3Frame()
        ts = _get_ts(dtype)
        f['ts_%s' % dtype] = ts
        w.Process(f)
    del w

    # Readback
    r = so3g.G3IndexedReader(test_file)
    last = 0
    for dtype in ALL_DTYPES:
        for i in range(2):
            r.Process(None)[0]
            here = r.Tell()
            sizes[dtype].append(here - last)
            last = here

    # Process the results...
    for dtype in ALL_DTYPES:
        err_msg = f'Failed for dtype={dtype}'
        n, s_uncomp, s_comp = sizes[dtype]
        comp_ratio = 1. - (s_uncomp - s_comp) / n
        # But really what matters is the bits-per-word, compressed.
        bits_per_word = comp_ratio * 8 * np.dtype(dtype).itemsize
        #print(dtype, bits_per_word / sigma_bits)
        # I think the theoretical limit is 1.3 or so...
        self.assertLess(bits_per_word, sigma_bits * 1.4, err_msg)
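# Worked example of the size bookkeeping above (numbers are illustrative
# only): for int32 data the in-memory payload is
# n = 100 channels * 10000 samples * 4 bytes = 4,000,000 bytes.  If the
# uncompressed frame serializes to roughly s_uncomp = 4,000,000 bytes and the
# compressed frame to roughly s_comp = 1,250,000 bytes, then
# comp_ratio = 1 - (4,000,000 - 1,250,000) / 4,000,000 = 0.3125 and
# bits_per_word = 0.3125 * 32 = 10.0, which is below the
# sigma_bits * 1.4 = 11.2 threshold for sigma_bits = 8.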
def test_seek(self):
    """Test the Seek/Tell functionality of the G3IndexedReader.

    We read the first four frames, recording the position of the only
    Wiring frame in the file with Tell().  Then we Seek to that
    location and start reading again, expecting the first frame after
    Seek() to be the wiring frame.
    """
    print("Testing Seek/Tell in G3IndexedReader")
    r = so3g.G3IndexedReader(self._file)
    # Limit the number of Process calls; if we hit the end of the file,
    # then Seek won't work...
    for i in range(4):
        pos = r.Tell()
        f = r.Process(None)[0]
        print("  " + str(f.type))
        if f.type == core.G3FrameType.Wiring:
            w_pos = pos
            print('  Saved wiring frame position: {}'.format(w_pos))
    r.Seek(w_pos)
    # Now that we've seeked, our next frame should be Wiring
    assert r.Process(None)[0].type == core.G3FrameType.Wiring
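# Minimal sketch of the Tell()/Seek() pattern this test exercises
# (illustrative; `path` is hypothetical and the file is assumed to hold
# at least three frames):
#
#     r = so3g.G3IndexedReader(path)
#     offsets = []
#     while True:
#         pos = r.Tell()            # record position before reading
#         if not r.Process(None):
#             break
#         offsets.append(pos)       # byte offset of each frame
#     # Per the comment above, Seek may not work after reaching EOF, so
#     # re-open the file before jumping back to a saved offset:
#     r = so3g.G3IndexedReader(path)
#     r.Seek(offsets[2])            # jump straight to the third frame
#     frame = r.Process(None)[0]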
def get_data(self, field=None, start=None, end=None, min_stride=None,
             raw=False, short_match=False):
    """Load data from specified field(s) between specified times.

    Arguments ``field``, ``start``, ``end``, ``short_match`` are as
    described in _get_groups.

    Arguments:
      min_stride (float): Specifies the minimum sample spacing, in
        seconds.  Ignored in this implementation.
      raw (bool): If true, return G3 blocks instead of numpy arrays.

    Returns:
      Pair of dictionaries, (data, timelines).

      The ``data`` dictionary is a simple map from field name to a
      numpy array of readings.

      The ``timelines`` dictionary is a map from field group name to
      a dictionary of timeline information, which has entries:

      - ``'t'``: numpy array of timestamps
      - ``'fields'``: list of fields belonging to this group.
      - ``'finalized_until'``: in cases where the data are still in
        flux, this field provides a timestamp that may be taken as
        the earliest time that needs to be requeried.  This is part
        of the interface in order to support data streams that are
        being updated in real time.

      If the user requested raw=True, then the return value is a list
      of tuples of the form (group_name, block), where block is a
      single G3TimesampleMap carrying all the data for that
      co-sampled group.
    """
    grouped = self._get_groups(field, start, end, short_match=short_match)
    hk_logger.debug('get_data: _get_groups returns %i groups.' % len(grouped))

    # Pass through the metadata.  Collect information on what field
    # groups are present in what frames of what files; sort that info
    # by file and offset so we make a single monotonic pass through
    # the frames.
    group_info = {
        # group_name: {'types': [dtype, ...],
        #              'fields': [(full_name, short_name), ...],
        #              'count': n},
        # ...
    }
    files = {
        # filename: {
        #   offset: [(block_index, group_name, output_offset), ...],
        #   ...
        # },
        # ...
    }
    for group_name, fields, fgrps in grouped:
        # This is a group of co-sampled fields.  The fields share a
        # sample count and a frame-index map.
        all_frame_refs = []
        for fg in fgrps:
            all_frame_refs.extend(
                [(b['timestamp'], b['count'], b['filename'],
                  b['byte_offset'], b['block_index'])
                 for b in fg.index_info])
        all_frame_refs.sort()
        vector_offset = 0
        for _, n, filename, byte_offset, block_index in all_frame_refs:
            if filename not in files:
                files[filename] = {}
            if byte_offset not in files[filename]:
                files[filename][byte_offset] = []
            files[filename][byte_offset].append(
                (block_index, group_name, vector_offset))
            vector_offset += n
        group_info[group_name] = {
            'count': vector_offset,
            'fields': [(f, f.split('.')[-1]) for f in fields],
            'types': [],
        }

    # Pass through the data.  Should read the files in order, jumping
    # monotonically through the needed frames.  The data type of
    # output arrays is determined from whatever np.array(G3Object)
    # returns on the first block.  Note strings ('U') have to be
    # handled differently because we can't know the maximum string
    # length from the first block.
    data = {}
    timelines = {}
    for filename, file_map in sorted(files.items()):
        hk_logger.info('get_data: reading %s' % filename)
        reader = so3g.G3IndexedReader(filename)
        for byte_offset, frame_info in sorted(file_map.items()):
            # Seek and decode.
            hk_logger.debug('get_data: seeking to %i for %i block extractions'
                            % (byte_offset, len(frame_info)))
            reader.Seek(byte_offset)
            frames = reader.Process(None)
            assert(len(frames) == 1)
            frames = self.translator(frames[0])
            frame = frames[0]
            # Now expand all blocks.
            for block_index, group_name, offset in frame_info:
                block = frame['blocks'][block_index]
                gi = group_info[group_name]
                if raw:
                    # Short-circuit if in raw mode; just collect all
                    # blocks for the group.
                    if group_name not in data:
                        data[group_name] = []
                    data[group_name].append(block)
                    continue
                if group_name not in timelines:
                    # This block is init that happens only once per group.
                    timelines[group_name] = {
                        't': np.zeros(gi['count']),
                        'fields': [f for f, s in gi['fields']],
                    }
                    hk_logger.debug('get_data: creating group "%s" with %i fields'
                                    % (group_name, len(gi['fields'])))
                    # Determine data type of each field and create
                    # output arrays.
                    for field, f_short in gi['fields']:
                        dtype = np.array(block[f_short]).dtype
                        gi['types'].append(dtype)
                        if dtype.char == 'U':
                            data[field] = []
                        else:
                            data[field] = np.empty(gi['count'], dtype)
                        hk_logger.debug('get_data: field "%s" has type %s' % (
                            f_short, dtype))
                # Copy in block data.
                n = len(block.times)
                # Note this is in G3 time units for now... fixed later.
                timelines[group_name]['t'][offset:offset+n] = \
                    [_t.time for _t in block.times]
                for (field, f_short), dtype in zip(gi['fields'], gi['types']):
                    if dtype.char == 'U':
                        data[field].append(
                            (offset, list(map(str, block[f_short]))))
                    else:
                        # This is a relatively quick copy because of
                        # buffer pass-through from G3... don't hit the
                        # RHS with np.array!
                        data[field][offset:offset+n] = block[f_short]

    if raw:
        return [(group_name, _concat_hk_stream(data[group_name]))
                for group_name, _, _ in grouped]

    # Finalize string fields.
    for group_name, fields, fgrps in grouped:
        gi = group_info[group_name]
        for (field, f_short), dtype in zip(gi['fields'], gi['types']):
            if dtype.char == 'U':
                data[field] = np.array(list(itertools.chain(
                    *[x for i, x in sorted(data[field])])))
                assert(len(data[field]) == gi['count'])

    # Scale out time units and mark last time.
    for timeline in timelines.values():
        timeline['t'] /= core.G3Units.seconds
        timeline['finalized_until'] = timeline['t'][-1]

    return (data, timelines)
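# Usage sketch (illustrative): assuming `arc` is the archive object exposing
# this get_data() method, and that a field named 'observatory.HWP.speed'
# exists in the archive (field name and times are hypothetical):
#
#     data, timelines = arc.get_data(['observatory.HWP.speed'],
#                                    start=1570000000, end=1570003600)
#     group = list(timelines)[0]
#     t = timelines[group]['t']                 # timestamps, in seconds
#     v = data['observatory.HWP.speed']         # numpy array of readings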
def load_data(self, start, end, show_pb=True, load_biases=True):
    """
    Loads smurf G3 data for a given time range. For the specified time
    range this will return a chunk of data that includes that time range.

    Args
    ----
    start (timestamp): start timestamp
    end (timestamp): end timestamp
    show_pb (bool, optional): If True, will show progress bar.
    load_biases (bool, optional): If True, will return biases.

    Returns
    -------
    Returns a tuple ``SmurfData(times, data, primary, status, biases,
    timing_paradigm)`` with the following fields:

        times (np.ndarray[samples]):
            Array of unix timestamps for loaded data
        data (np.ndarray[channels, samples]):
            Array of the squid phase in units of radians for each channel
            with data in the specified time range. The index of the array
            is the readout channel number.
        primary (Dict[np.ndarray]):
            Dict of numpy arrays holding the "primary" data included in
            the packet headers, including 'AveragingResetBits', 'Counter0',
            'Counter1', 'Counter2', 'FluxRampIncrement', 'FluxRampOffset',
            'FrameCounter', 'TESRelaySetting', 'UnixTime'
        status (SmurfStatus):
            SmurfStatus object containing metadata info at the time of the
            first Scan frame in the requested interval. If there are no
            Scan frames in the interval, this will be None.
        biases (optional, np.ndarray[NTES, samples]):
            An array containing the TES bias values. If ``load_biases`` is
            False, this will be None.
        timing_paradigm (TimingParadigm):
            Tells you the method used to extract timestamps from the frame
            data.
    """
    session = self.Session()
    frames = session.query(Frame).filter(
        Frame.type_name == 'Scan',
        Frame.stop >= dt.datetime.fromtimestamp(start),
        Frame.start < dt.datetime.fromtimestamp(end)
    ).order_by(Frame.time)
    session.close()

    samples, channels = 0, 0
    num_frames = 0
    for f in frames:
        num_frames += 1
        samples += f.samples
        channels = max(f.channels, channels)

    timestamps = np.full((samples, ), np.nan, dtype=np.float64)
    data = np.full((channels, samples), 0, dtype=np.int32)
    if load_biases:
        biases = np.full((num_bias_lines, samples), 0, dtype=np.int32)
    else:
        biases = None

    primary = {}

    cur_sample = 0
    cur_file = None
    timing_paradigm = None
    for frame_info in tqdm(frames, total=num_frames, disable=(not show_pb)):
        file = frame_info.file.path
        if file != cur_file:
            reader = so3g.G3IndexedReader(file)
            cur_file = file

        reader.Seek(frame_info.offset)
        frame = reader.Process(None)[0]
        nsamp = frame['data'].n_samples

        key_order = [int(k[1:]) for k in frame['data'].keys()]
        data[key_order, cur_sample:cur_sample + nsamp] = frame['data']

        if load_biases:
            bias_order = [int(k[-2:]) for k in frame['tes_biases'].keys()]
            biases[bias_order, cur_sample:cur_sample + nsamp] = \
                frame['tes_biases']

        # Loads primary data
        if 'primary' in frame.keys():
            for k, v in frame['primary'].items():
                if k not in primary:
                    primary[k] = np.zeros(samples, dtype=np.int64)
                primary[k][cur_sample:cur_sample + nsamp] = v

        ts, paradigm = get_sample_timestamps(frame)
        if timing_paradigm is None:
            timing_paradigm = paradigm
        elif timing_paradigm != paradigm:
            timing_paradigm = TimingParadigm.Mixed

        timestamps[cur_sample:cur_sample + nsamp] = ts
        cur_sample += nsamp

    # Conversion from DAC counts to squid phase
    rad_per_count = np.pi / 2**15
    data = data * rad_per_count

    if len(timestamps) > 0:
        status = self.load_status(timestamps[0])
    else:
        status = None

    SmurfData = namedtuple(
        'SmurfData', 'times data primary status biases timing_paradigm')
    if load_biases:
        return SmurfData(timestamps, data, primary, status, biases,
                         timing_paradigm)
    else:
        return SmurfData(timestamps, data, primary, status, None,
                         timing_paradigm)
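# Usage sketch (illustrative): assuming `archive` is an instance of the class
# providing load_data(), a one-hour span (hypothetical timestamps) could be
# loaded with:
#
#     start, end = 1600000000, 1600003600
#     sdata = archive.load_data(start, end, show_pb=False)
#     times, phases = sdata.times, sdata.data   # unix times, phase in radians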
def get_data(self, field=None, start=None, end=None, min_stride=None,
             raw=False, short_match=False):
    """Load data from specified field(s) between specified times.

    Arguments ``field``, ``start``, ``end``, ``short_match`` are as
    described in _get_groups.

    Returns:
      Pair of dictionaries, (data, timelines).

      The ``data`` dictionary is a simple map from field name to a
      numpy array of readings.

      The ``timelines`` dictionary is a map from field group name to
      a dictionary of timeline information, which has entries:

      - ``'t'``: numpy array of timestamps
      - ``'fields'``: list of fields belonging to this group.
      - ``'finalized_until'``: in cases where the data are still in
        flux, this field provides a timestamp that may be taken as
        the earliest time that needs to be requeried.  This is part
        of the interface in order to support data streams that are
        being updated in real time.
    """
    grouped = self._get_groups(field, start, end, short_match=short_match)
    handles = {}  # filename -> G3IndexedReader map.
    blocks_out = []
    for group_name, fields, fgrps in grouped:
        blocks_in = []
        for fg in fgrps:
            for r in fg.index_info:
                fn, off = r['filename'], r['byte_offset']
                if fn not in handles:
                    handles[fn] = so3g.G3IndexedReader(fn)
                handles[fn].Seek(off)
                frames = handles[fn].Process(None)
                assert (len(frames) == 1)
                # Find the right block.
                for blk in frames[0]['blocks']:
                    test_f = fields[0].split('.')[-1]  # dump prefix.
                    if test_f in blk.data.keys():
                        blocks_in.append(blk)
                        break
        # Sort those blocks by timestamp.  (Otherwise they'll stay
        # sorted by object id :)
        blocks_in.sort(key=lambda b: b.t[0])
        # Create a new Block for this group.
        blk = so3g.IrregBlockDouble()
        blk.t = np.hstack([b.t for b in blocks_in])
        for f in fields:
            blk.data[f] = np.hstack(
                [b.data[f.split('.')[-1]] for b in blocks_in])
        blocks_out.append((group_name, blk))

    if raw:
        return blocks_out

    # Reformat for sisock.
    data = {}
    timelines = {}
    for group_name, block in blocks_out:
        timelines[group_name] = {
            't': np.array(block.t),
            'finalized_until': block.t[-1],
            'fields': list(block.data.keys()),
        }
        for k, v in block.data.items():
            data[k] = np.array(v)

    return (data, timelines)
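# Usage sketch (illustrative): with raw=True this older implementation returns
# (group_name, block) pairs, where each block is an so3g.IrregBlockDouble with
# a `t` vector and a `data` map, so the result can be walked directly
# (`arc` and 'some.field' are hypothetical):
#
#     for group_name, blk in arc.get_data(['some.field'], raw=True):
#         print(group_name, len(blk.t), list(blk.data.keys()))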