def test_limit_to_span(self):
    # A single event that extends past both ends of the span is
    # expected to come back clipped to exactly the span.
    source = [esal.Event(esal.Interval(1, 9), 'a', 1)]
    want = [(esal.Interval(3, 7), 1)]
    got = list(events.periods(source, span_lo=3, span_hi=7))
    self.assertEqual(want, got)
def test_min_len_limit_to_span(self):
    # A point event at 1 with min_len=7 would reach past the span
    # (2, 5), so the single expected period is the span itself.
    source = [esal.Event(esal.Interval(1, 1), 'a', 1)]
    want = [(esal.Interval(2, 5), 1)]
    got = list(events.periods(source, span_lo=2, span_hi=5, min_len=7))
    self.assertEqual(want, got)
def test_backoff_too_much(self):
    # The backoff (5) is larger than the distance between the two
    # events, so the first period is expected to collapse back to its
    # own start rather than end before it began.
    source = [
        esal.Event(esal.Interval(at, at), 'a', val)
        for at, val in ((1, 1), (5, 2))
    ]
    want = [
        (esal.Interval(1, 1), 1),
        (esal.Interval(5, 9), 2),
    ]
    got = list(events.periods(
        source, span_lo=1, span_hi=9, min_len=5, backoff=5))
    self.assertEqual(want, got)
def test_backoff_drop_empty_zero_val(self):
    # Two same-valued events one unit apart with backoff=1: no zero
    # period is expected before, between, or after them.
    bounds = ((1, 2), (3, 4))
    source = [esal.Event(esal.Interval(lo, hi), 'a', 1) for lo, hi in bounds]
    want = [(esal.Interval(lo, hi), 1) for lo, hi in bounds]
    got = list(events.periods(source, span_lo=0, span_hi=4, backoff=1))
    self.assertEqual(want, got)
def test_fill_span(self):
    # One event strictly inside the span: zero periods are expected to
    # fill the span on both sides of it.
    source = [esal.Event(esal.Interval(2, 3), 'a', 1)]
    want = [
        (esal.Interval(lo, hi), val)
        for lo, hi, val in ((1, 2, 0), (2, 3, 1), (3, 4, 0))
    ]
    got = list(events.periods(source, span_lo=1, span_hi=4))
    self.assertEqual(want, got)
def test_backoff_non_overlapping(self):
    # Point events at 1 and 5 with min_len=3 and backoff=2: the first
    # period is expected to stop 2 short of the second one's start and
    # no zero period is expected in between.
    source = [
        esal.Event(esal.Interval(at, at), 'a', val)
        for at, val in ((1, 1), (5, 2))
    ]
    want = [
        (esal.Interval(1, 3), 1),
        (esal.Interval(5, 8), 2),
    ]
    got = list(events.periods(
        source, span_lo=1, span_hi=8, min_len=3, backoff=2))
    self.assertEqual(want, got)
def test_empty(self):
    # With no events at all, the whole span is expected to come back
    # as a single zero period.
    want = [(esal.Interval(0, 9), 0)]
    got = list(events.periods([], span_lo=0, span_hi=9))
    self.assertEqual(want, got)
def test_zero_values(self):
    # The single event carries the string '0' as its value.
    source = [esal.Event(esal.Interval(1, 7), 'a', '0')]

    # When only the integer 0 counts as zero, the event is a real
    # period surrounded by zero filler.
    want_kept = [
        (esal.Interval(0, 1), 0),
        (esal.Interval(1, 7), '0'),
        (esal.Interval(7, 9), 0),
    ]
    got_kept = list(events.periods(
        source, span_lo=0, span_hi=9, zero_values=(0, )))
    self.assertEqual(want_kept, got_kept)

    # When the string '0' counts as zero, the event is ignored and the
    # span is one zero period.
    want_dropped = [(esal.Interval(0, 9), 0)]
    got_dropped = list(events.periods(
        source, span_lo=0, span_hi=9, zero_values=('0', )))
    self.assertEqual(want_dropped, got_dropped)
def setUp(self):
    # Two interval objects are shared by every fixture event: one built
    # from two dates and one built from a single date.
    two_dates = esal.Interval('2019-04-09', '2019-04-10')
    one_date = esal.Interval('2019-04-09')

    # One row per event, in fixture order:
    # (interval, category, type, first element of the value pair).
    # The second element of every value pair is None.
    rows = [
        (two_dates, 'mx', '4', 'lo'),
        (two_dates, 'mx', '4', 'hi'),
        (two_dates, 'mx', '5', 'lo'),
        (two_dates, 'mx', '8', 'hi'),
        (two_dates, 'mx', '0', 'lo'),
        (two_dates, 'mx', '2', 'ok'),
        (two_dates, 'mx', '1', 'ok'),
        (two_dates, 'mx', '8', 'ok'),
        (one_date, 'dx', '2', None),
        (one_date, 'dx', '0', None),
        (one_date, 'dx', '7', None),
        (one_date, 'dx', '2', None),
        (one_date, 'dx', '1', None),
        (one_date, 'dx', '3', None),
        (two_dates, 'rx', '0', None),
        (two_dates, 'rx', '5', None),
        (two_dates, 'rx', '0', None),
        (two_dates, 'rx', '4', None),
        (two_dates, 'rx', '1', None),
        (two_dates, 'px', '3', None),
        (two_dates, 'px', '2', None),
        (two_dates, 'px', '5', None),
        (two_dates, 'px', '2', None),
        (two_dates, 'ox', '0', None),
        (two_dates, 'ox', '6', None),
        (two_dates, 'vx', '1', None),
        (two_dates, 'vx', '1', None),
        (two_dates, 'xx', None, None),
    ]
    fixture_events = [
        esal.Event(when, (cat, typ), (val, None))
        for when, cat, typ, val in rows
    ]

    # Facts are ((category, type), value) pairs.
    fixture_facts = [
        (('bx', 'dob'), '1949-04-09'),
        (('bx', 'ethn'), '1'),
        (('bx', 'gndr'), 'M'),
        (('bx', 'race'), '2'),
        (('hx', 'cancer-father'), 'yes'),
        (('hx', 'cancer-mother'), 'no'),
    ]

    self.ev_seq = esal.EventSequence(
        events=fixture_events,
        facts=fixture_facts,
        id=808186755,
    )
def sequence(
        event_records,
        event_sequence_id=None,
        header_nm2idx=header_nm2idx,
):
    """
    Build an event sequence from the given records and return it.

    A record whose `lo` and `hi` fields are both `None` becomes a fact
    `((cat, typ), val)`.  Every other record becomes an event with
    interval `(lo, hi)`, type `(cat, typ)` and value `(val, jsn)`.

    event_records:
        Iterable of event records where each record is an indexable
        collection of values.
    event_sequence_id:
        ID for the constructed event sequence.  While it is `None`, it
        is replaced by the `id` field of each record in turn, so it
        ends up as the first record ID that is not `None` (or stays
        `None` if there is no such record).
    header_nm2idx:
        Mapping of event record field names to their indices in the
        record.  Must include at least the following names: id, lo, hi,
        cat, typ, val, jsn.
    """
    # Resolve every required field position up front so that a missing
    # name fails before any record is consumed
    (id_at, lo_at, hi_at, cat_at, typ_at, val_at, jsn_at) = [
        header_nm2idx[name]
        for name in ('id', 'lo', 'hi', 'cat', 'typ', 'val', 'jsn')
    ]

    seq_id = event_sequence_id
    found_facts = []
    found_events = []
    for record in event_records:
        # Adopt the record's ID while no ID is known
        if seq_id is None:
            seq_id = record[id_at]

        when_lo = record[lo_at]
        when_hi = record[hi_at]
        key = (record[cat_at], record[typ_at])

        # A record without any times is a fact
        if when_lo is None and when_hi is None:
            found_facts.append((key, record[val_at]))
            continue

        # Anything with at least one time is an event
        found_events.append(esal.Event(
            esal.Interval(when_lo, when_hi),
            key,
            (record[val_at], record[jsn_at]),
        ))

    return esal.EventSequence(found_events, found_facts, seq_id)
def test_backoff_zero_val(self):
    # Three same-valued events with backoff=1.  Zero periods are only
    # expected where a gap is still non-empty after backing off.
    source = [
        esal.Event(esal.Interval(lo, hi), 'a', 1)
        for lo, hi in ((2, 3), (5, 6), (9, 10))
    ]
    want_to_11 = [
        (esal.Interval(0, 1), 0),
        (esal.Interval(2, 3), 1),
        (esal.Interval(5, 6), 1),
        (esal.Interval(7, 8), 0),
        (esal.Interval(9, 10), 1),
    ]
    got_to_11 = list(events.periods(
        source, span_lo=0, span_hi=11, backoff=1))
    self.assertEqual(want_to_11, got_to_11)

    # Extending the span by one unit leaves room for a trailing zero
    # period after the backoff.
    want_to_12 = want_to_11 + [(esal.Interval(11, 12), 0)]
    got_to_12 = list(events.periods(
        source, span_lo=0, span_hi=12, backoff=1))
    self.assertEqual(want_to_12, got_to_12)
def test_empty_span(self):
    # Every event touches the zero-length span (5, 5), so each value
    # is expected as its own zero-length period at 5.
    source = [
        esal.Event(esal.Interval(lo, hi), 'a', val)
        for lo, hi, val in ((3, 5, 1), (4, 6, 2), (5, 5, 3), (5, 7, 4))
    ]
    want = [(esal.Interval(5, 5), val) for val in (1, 2, 3, 4)]
    got = list(events.periods(source, span_lo=5, span_hi=5))
    self.assertEqual(want, got)
def test_min_len(self):
    # With min_len=2, events shorter than 2 are expected to be
    # lengthened, with zero periods filling what remains between them.
    source = [
        esal.Event(esal.Interval(lo, hi), 'a', 1)
        for lo, hi in ((1, 1), (4, 5), (7, 9))
    ]
    want = [
        (esal.Interval(lo, hi), val)
        for lo, hi, val in (
            (1, 3, 1),
            (3, 4, 0),
            (4, 6, 1),
            (6, 7, 0),
            (7, 9, 1),
        )
    ]
    got = list(events.periods(source, span_lo=1, span_hi=9, min_len=2))
    self.assertEqual(want, got)
def test_min_len_drop_empty_zero_val(self):
    # Zero-valued events are interleaved with two nonzero point events.
    # After lengthening to min_len=2 the nonzero periods tile the span
    # (2, 6), so no zero period is expected in the output.
    source = [
        esal.Event(esal.Interval(lo, hi), 'a', val)
        for lo, hi, val in (
            (0, 2, 0),
            (2, 2, 1),
            (2, 4, 0),
            (4, 4, 2),
            (4, 9, 0),
        )
    ]
    want = [
        (esal.Interval(2, 4), 1),
        (esal.Interval(4, 6), 2),
    ]
    got = list(events.periods(source, span_lo=2, span_hi=6, min_len=2))
    self.assertEqual(want, got)
def test_same_points(self):
    # Duplicate point events at the same time are expected to merge
    # into one period per distinct value.
    source = [
        esal.Event(esal.Interval(1, 1), 'a', val)
        for val in (1, 1, 2, 2)
    ]
    want = [(esal.Interval(1, 1), val) for val in (1, 2)]
    got = list(events.periods(source, span_lo=1, span_hi=1))
    self.assertEqual(want, got)
def test_merge(self):
    # Two touching events with the same value are expected to merge,
    # and the merged period is then cut at the start of the following
    # event with a different value.
    source = [
        esal.Event(esal.Interval(lo, hi), 'a', val)
        for lo, hi, val in ((1, 4, 1), (4, 8, 1), (6, 9, 2))
    ]
    want = [
        (esal.Interval(0, 1), 0),
        (esal.Interval(1, 6), 1),
        (esal.Interval(6, 9), 2),
    ]
    got = list(events.periods(source, span_lo=0, span_hi=9))
    self.assertEqual(want, got)
def test_overlapping_intervals(self):
    # A chain of overlapping events, one of them zero-valued.  Each
    # period is expected to end where the next different nonzero value
    # starts, with zero filler only at the ends of the span.
    source = [
        esal.Event(esal.Interval(lo, hi), 'a', val)
        for lo, hi, val in (
            (1, 3, 1),
            (2, 4, 2),
            (3, 5, 1),
            (4, 6, 0),
            (5, 7, 2),
            (6, 8, 1),
        )
    ]
    want = [
        (esal.Interval(lo, hi), val)
        for lo, hi, val in (
            (0, 1, 0),
            (1, 2, 1),
            (2, 3, 2),
            (3, 5, 1),
            (5, 6, 2),
            (6, 8, 1),
            (8, 9, 0),
        )
    ]
    got = list(events.periods(source, span_lo=0, span_hi=9))
    self.assertEqual(want, got)
def periods(
        events,
        span_lo=None,
        span_hi=None,
        value=None,
        zero_values=(0, None),
        min_len=0,
        backoff=0,
        output_zero=0,
):
    """
    Yield disjoint intervals corresponding to different values of the
    given events.

    Converts a sequence of events that approximately represent a signal
    into a guess at the underlying piecewise constant signal (a
    sequence of intervals that partitions a span of time, where each
    interval has a value).  Assumes the given events are sorted by
    their start times.

    The conversion gives each interval a minimum length, unions
    intervals with the same value and then puts them in sequence by
    truncating an interval at the start of the next interval with a
    different, nonzero value (with optional back-off).  Finally, fills
    in gaps with zero values.  This is intended to be useful for
    constructing event "eras" where the values of an event are mutually
    exclusive (e.g. different dosages of a medication).

    For example, given the span (1, 8), min_len=3 and backoff=2, a
    point event at 1 with value 1 followed by a point event at 5 with
    value 2 yields the periods (1, 3) -> 1 and (5, 8) -> 2.  The first
    event is lengthened to (1, 4) and then backs off 2 from the start
    of the second, the second is lengthened to (5, 8), and no zero
    period is yielded between them because the gap is empty once the
    backoff is applied on both sides.

    events:
        Iterable of events.  Each event must have `when.lo` and
        `when.hi`, plus `value` unless `value` is given.
    span_lo:
        Start (if any) of span to which events are clipped.  When
        `None`, no zero period is yielded before the first nonzero
        period.
    span_hi:
        End (if any) of span to which events are clipped.  When `None`,
        no zero period is yielded after the last nonzero period.
    value:
        Function to extract values from events: value(event) -> object.
        Default uses `esal.Event.value`.
    zero_values:
        Set of values to treat as zero (non-signal) and ignore.
    min_len:
        Minimum length of each interval (prior to any truncation).
    backoff:
        Size of gap between intervals.  A larger gap increases the
        chances that an underlying transition from one value to the
        next happened in the gap.  [TODO technically also need a
        starting lag / offset]
    output_zero:
        Value to use when filling in between nonzero values.

    Yields `(esal.Interval, value)` pairs in order of start time.
    """
    # Lengthen and clip nonzero periods
    prds = []
    for ev in events:
        # Ensure a minimum length before clipping
        lo = ev.when.lo
        hi = max(ev.when.hi, lo + min_len)
        val = value(ev) if value is not None else ev.value
        # Discard any events that are "non-events" (have zero value) or
        # that are outside the allowed span
        if (val in zero_values
                or (span_lo is not None and hi < span_lo)
                or (span_hi is not None and lo > span_hi)):
            continue
        # Clip to allowed span
        if span_hi is not None:
            hi = min(hi, span_hi)
        if span_lo is not None:
            lo = max(lo, span_lo)
        prds.append((lo, hi, val))

    # Merge and sequentialize periods.  Each period is only ever
    # compared with the most recently kept one, so building a new list
    # visits the same pairs as deleting in place, in linear time.
    seq = []
    for lo2, hi2, val2 in prds:
        if seq:
            lo1, hi1, val1 = seq[-1]
            # Merge periods with the same value.  The union ends at the
            # later of the two ends: a later-starting period may end
            # before the earlier one does and must not shrink it.
            if hi1 >= lo2 and val1 == val2:
                seq[-1] = (lo1, max(hi1, hi2), val1)
                continue
            # Put periods in sequence by removing overlaps
            if hi1 > lo2:
                seq[-1] = (lo1, lo2, val1)
        seq.append((lo2, hi2, val2))

    # Yield periods with intervening zero periods as needed.  Separate
    # periods by backing off from the following nonzero event (if there
    # is one).  `zero_lo` is `None` only while no span start is known,
    # in which case there is nothing to anchor a leading zero period.
    zero_lo = span_lo
    n_prds = len(seq)
    for idx, (lo, hi, val) in enumerate(seq):
        # Yield a preceding zero period if it would be non-empty after
        # backing off from the current event
        zero_hi = lo - backoff
        if zero_lo is not None and zero_lo < zero_hi:
            yield (esal.Interval(zero_lo, zero_hi), output_zero)
        # Back off from the following nonzero event if there is one,
        # but never to before this period's own start
        if idx + 1 < n_prds:
            hi_bk = max(min(hi, seq[idx + 1][0] - backoff), lo)
        else:
            hi_bk = hi
        yield (esal.Interval(lo, hi_bk), val)
        # Increment.  Delay the zero period by the backoff amount.
        zero_lo = hi_bk + backoff
    # Fill to the end of the span, if the span has an end
    if (zero_lo is not None
            and span_hi is not None
            and zero_lo < span_hi):
        yield (esal.Interval(zero_lo, span_hi), output_zero)
def mk_feature_vectors(
        events_csv_filename,
        examples_csv_filename,
        features_csv_filename,
        events_csv_format=events.csv_format,
        examples_csv_format=examples.csv_format,
        features_csv_format=csv_format,
        events_header=events.header(),
        examples_header=examples.header(),
        features_header=header(),
        events_header_detector=True,
        examples_header_detector=True,
        features_header_detector=True,
        include_event_record=None,
        transform_event_record=None,
        include_example_record=None,
        always_feature_keys=(),
        feature_function_namespaces=None,
        feature_function_modules=None,
):
    """
    Make and yield feature vectors.

    Reads example definitions and feature definitions, then reads event
    sequences for the IDs that have examples.  For each example of each
    sequence, builds a feature vector from the events that overlap the
    example's `(lo, hi)` interval.

    Yields `(example-record, feature-vector)` pairs.

    The `*_csv_filename`, `*_csv_format`, `*_header` and
    `*_header_detector` arguments are passed to `records.read_csv`
    (events, examples) or `load` (features).  `include_example_record`
    is passed to `records.read_csv` for the examples file;
    `include_event_record` and `transform_event_record` are passed to
    `events.read_sequences`.  `always_feature_keys` is passed to
    `vector`.  `feature_function_namespaces` and
    `feature_function_modules` are passed to `load`.
    """
    # Find the ID column of event records
    ev_name2col = {field[0]: col for col, field in enumerate(events_header)}
    ev_id_col = ev_name2col['id']

    # Find the ID and interval columns of example records
    ex_name2col = {
        field[0]: col for col, field in enumerate(examples_header)}
    ex_id_col = ex_name2col['id']
    ex_lo_col = ex_name2col['lo']
    ex_hi_col = ex_name2col['hi']

    # Read the example definitions and group them by ID
    example_records = records.read_csv(
        examples_csv_filename,
        examples_csv_format,
        examples_header,
        examples_header_detector,
        include_record=include_example_record,
    )
    examples_by_id = collections.defaultdict(list)
    for example in example_records:
        examples_by_id[example[ex_id_col]].append(example)

    # Read the feature definitions; only the third result is needed
    _, _, feature_defs = load(
        features_csv_filename,
        features_csv_format,
        features_header,
        features_header_detector,
        feature_function_namespaces,
        feature_function_modules,
    )

    # Read raw (unparsed) event records and group them into event
    # sequences, restricted to the IDs that have examples
    event_records = records.read_csv(
        events_csv_filename,
        events_csv_format,
        events_header,
        header_detector=events_header_detector,
        parser=False,
    )
    sequences = events.read_sequences(
        event_records,
        header=events_header,
        parse_id=events_header[ev_id_col][1],
        include_ids=examples_by_id,
        parse_record=records.mk_parser(events_header),
        include_record=include_event_record,
        transform_record=transform_event_record,
    )

    # Create a feature vector for each example definition
    for seq in sequences:
        # A sequence whose ID has no examples contributes nothing
        for example in examples_by_id.get(seq.id, ()):
            # Restrict the sequence to the events that overlap the
            # example period
            window = esal.Interval(example[ex_lo_col], example[ex_hi_col])
            overlapping = seq.events_overlapping(
                window.lo,
                window.hi,
                window.is_lo_open,
                window.is_hi_open,
            )
            subseq = seq.subsequence(overlapping)
            # Yield the example together with its feature vector
            yield example, vector(
                feature_defs, example, subseq, always_feature_keys)