def __init__(self, descriptor):
    """
    Store the descriptor, reset the breath bookkeeping and detect the
    column layout of the file from its first line.

    :param descriptor: The file descriptor to use

    :raises ValueError: when descriptor is not a StringIO, cStringIO,
                        io.TextIOWrapper or io.BufferedReader object
    :raises BadDescriptorError: when clear_descriptor_null_bytes hits a
                                UnicodeDecodeError
    """
    self.descriptor = descriptor
    acceptable = (
        isinstance(descriptor, StringIO)
        or "cStringIO" in str(descriptor.__class__)
        or isinstance(descriptor, io.TextIOWrapper)
        or isinstance(descriptor, io.BufferedReader)
    )
    if not acceptable:
        raise ValueError(
            "Provide a file descriptor as input! Make sure you are using a Python3 compatible descriptor such as io.open."
        )

    # Breath counters and timing state all start from a clean slate.
    self.vent_bn = 0
    self.rel_bn = 0
    self.rel_bs_time = 0
    self.abs_bs_time = None
    self.cur_abs_time = None

    try:
        cleaned = clear_descriptor_null_bytes(descriptor)
    except UnicodeDecodeError:
        raise BadDescriptorError(
            'You seem to have opened a file with garbled bytes. you should open it using io.open(file, encoding="ascii", errors="ignore"'
        )
    self.descriptor = cleaned

    # Peek at the first line to learn the layout, then rewind so later
    # readers start from the top of the file.
    cleaned.seek(0)
    layout = detect_version_v2(cleaned.readline())
    self.bs_col, self.ncol, self.ts_1st_col, self.ts_1st_row = layout
    cleaned.seek(0)
def gather_flow_and_pressure(f_desc):
    """
    Gather time, flow, and pressure data given some file descriptor of a b2c data
    file.

    Rows are grouped by breath: a row whose marker column reads "BS" opens a
    new group, a "BE" row appends the current group to the result, and any
    other row is parsed as one (time, flow, pressure) observation. Time is a
    running counter advanced by 0.02 per collected observation; it is not
    reset between breaths.

    Rows too short to contain the marker column (blank lines, for example)
    and rows that appear before the first "BS" are skipped.

    :param f_desc: seekable file descriptor; it is rewound before reading

    Returns a list with one entry per BS/BE pair, each entry being a list of
    tuples
    [ (t1, flow1, pressure1), (t2, flow2, pressure2), ... ]

    :raises ValueError: if a data row inside a breath is not numeric
    :raises IndexError: if a data row inside a breath has no pressure column
    """
    f_desc.seek(0)
    bs_col, _ncol, _tsfc, _tssc = detect_version_v2(f_desc.readline())
    f_desc.seek(0)
    reader = csv.reader(f_desc)

    data = []
    points = None  # stays None until the first BS row has been seen
    t = 0
    delta = 0.02
    for line in reader:
        if len(line) <= bs_col:
            # csv yields [] for blank lines; nothing to inspect on short rows
            continue
        marker = line[bs_col].strip()
        if marker == "BS":
            points = []
        elif points is None:
            # Stray data/BE ahead of the first BS used to crash on the
            # unbound `points`; ignore it instead.
            continue
        elif marker == "BE":
            data.append(points)
        else:
            t = t + delta
            points.append((t, float(line[bs_col]), float(line[bs_col + 1])))
    return data
def cut_breath_section(descriptor, bn_start, bn_end, start_abs_bs):
    """
    Cut up a file by relative breath number

    The file is scanned row by row; every "BS" marker increments the relative
    breath counter. Non-blank lines are kept from the first row at which the
    counter equals bn_start up to and including the "BE" row of breath bn_end.

    :param descriptor: file descriptor for file to chunk up. It is expected to
                       be positioned at the start of the file.
    :param bn_start: starting (inclusive) relative breath number
    :param bn_end: ending (inclusive) relative breath number
    :param start_abs_bs: because this function cuts off the absolute breath
                         start timestamp we can provide a new one for the file
                         if we need. If we don't care we can just provide None

    :returns: StringIO holding the kept lines, prefixed by start_abs_bs on its
              own line when one was given
    :raises ValueError: if bn_start or bn_end cannot be converted to int
    :raises Exception: if start_abs_bs is not in %Y-%m-%d-%H-%M-%S.%f format,
                       or if the file ends before the BE of breath bn_end
    """
    try:
        bn_start = int(bn_start)
        bn_end = int(bn_end)
    except (TypeError, ValueError):
        raise ValueError(
            "Must input bn_start and bn_end as integers! Your input "
            "bn_start: {}, bn_end: {}".format(bn_start, bn_end))

    if start_abs_bs:
        try:
            datetime.strptime(start_abs_bs, '%Y-%m-%d-%H-%M-%S.%f')
        except (TypeError, ValueError):
            raise Exception(
                'start_abs_bs must be in format %Y-%m-%d-%H-%M-%S.%f')

    bn = 0  # on APL the relative breath number starts at 1
    record_lines = False
    lines_to_keep = []
    bs_col, _, _, _ = detect_version_v2(descriptor.readline())
    descriptor.seek(0)
    reader = csv.reader(descriptor)
    # enumerate() keeps the index in step with the physical line even when a
    # blank row is skipped; the indices are later used against split("\n").
    for i, line in enumerate(reader):
        if not line:
            continue
        marker = line[bs_col].strip()
        if marker == "BS":
            bn += 1
        if bn == bn_start:
            record_lines = True
        if record_lines:
            lines_to_keep.append(i)
        if bn == bn_end and marker == "BE":
            descriptor.seek(0)
            lines = descriptor.read().split("\n")
            # Plain indexing: itemgetter with a single index returns a bare
            # string, which would then be joined character by character.
            text = "\n".join(lines[k] for k in lines_to_keep)
            if start_abs_bs:
                text = start_abs_bs + '\n' + text
            return StringIO(text)

    raise Exception(
        "Something went wrong. The input breath numbers seem to "
        "be incorrect or the file format does not match a raw "
        "ventilator waveform file")
def cut_breath_section(descriptor, bn_start, bn_end):
    """
    Cut up a file by relative breath number

    The file is scanned row by row; every "BS" marker increments the relative
    breath counter. Non-blank lines are kept from the first row at which the
    counter equals bn_start up to and including the "BE" row of breath bn_end.

    :param descriptor: file descriptor for file to chunk up. It is expected to
                       be positioned at the start of the file.
    :param bn_start: starting (inclusive) relative breath number
    :param bn_end: ending (inclusive) relative breath number

    :returns: StringIO holding the kept lines
    :raises ValueError: if bn_start or bn_end cannot be converted to int
    :raises Exception: if the file ends before the BE of breath bn_end
    """
    try:
        bn_start = int(bn_start)
        bn_end = int(bn_end)
    except (TypeError, ValueError):
        raise ValueError(
            "Must input bn_start and bn_end as integers! Your input "
            "bn_start: {}, bn_end: {}".format(bn_start, bn_end))

    bn = 0  # on APL the relative breath number starts at 1
    record_lines = False
    lines_to_keep = []
    bs_col, _, _, _ = detect_version_v2(descriptor.readline())
    descriptor.seek(0)
    reader = csv.reader(descriptor)
    # enumerate() keeps the index in step with the physical line; the indices
    # are later used against split("\n").
    for i, line in enumerate(reader):
        if not line:
            # csv yields [] for a blank line; indexing it used to raise
            continue
        marker = line[bs_col].strip()
        if marker == "BS":
            bn += 1
        if bn == bn_start:
            record_lines = True
        if record_lines:
            lines_to_keep.append(i)
        if bn == bn_end and marker == "BE":
            descriptor.seek(0)
            lines = descriptor.read().split("\n")
            # Plain indexing: itemgetter with a single index returns a bare
            # string, which would then be joined character by character.
            return StringIO("\n".join(lines[k] for k in lines_to_keep))

    raise Exception(
        "Something went wrong. The input breath numbers seem to "
        "be incorrect or the file format does not match a raw "
        "ventilator waveform file")
def extract_raw(descriptor,
                ignore_missing_bes,
                rel_bn_interval=[],
                vent_bn_interval=[],
                spec_rel_bns=[],
                spec_vent_bns=[]):
    """
    Takes a file descriptor and lazily yields the raw data of each breath.
    This is a generator; every yielded item is a dict in the format

    {
        'rel_bn': rel_bn,
        'vent_bn': vent_bn,
        't': [rel_t1, rel_t2, ...],
        'ts': [ts1, ts2, ...],
        'flow': [flow1, flow2, ...],
        'pressure': [pressure1, pressure2, ...],
        'be_count': be_count,
        'bs_count': bs_count,
        'bs_time': bs_time,
        'frame_dur': frame_dur,
        'dt': dt,
    }

    A breath is emitted when its BE row is reached, or, when
    ignore_missing_bes is False, when the next BS arrives while the breath is
    still open. A breath that is still open when the file ends is not
    emitted. Breaths whose arrays come back empty from filter_arrays are not
    emitted either.

    :param descriptor: The file descriptor to use
    :param ignore_missing_bes: boolean whether or not to ignore missing BEs in the data (False if we want to use breaths without a BE, True otherwise)
    :param rel_bn_interval: The relative [start, end] interval for the data
    :param vent_bn_interval: The vent bn [start, end] interval for the data
    :param spec_rel_bns: The specific relative bns that we want eg: [1, 10, 20]
    :param spec_vent_bns: The specific vent bns that we want eg: [1, 10, 20]

    :raises ValueError: if descriptor is not an accepted stream type (raised
                        on first iteration, since this is a generator)
    :raises BadDescriptorError: if clear_descriptor_null_bytes hits a
                                UnicodeDecodeError
    """
    # XXX You could probably save yourself a ton of time if you
    # processed the BS/BE rows to remove their trailing commas.
    # then you could use a method like np.genfromtext or something faster
    # than the native csv lib.

    # The list defaults above are only read, never mutated, so sharing them
    # between calls is harmless.

    def get_data(flow, pressure, t_array, ts_array, rel_bn, vent_bn, bs_count, be_count, bs_time, t_delta):
        # Build the per-breath dict, or return None when no samples remain
        # after filtering.
        flow, pressure, t_array, ts_array = filter_arrays(
            flow, pressure, t_array, ts_array
        )
        if flow:
            return {
                "rel_bn": rel_bn,
                "vent_bn": vent_bn,
                "flow": flow,
                "pressure": pressure,
                "t": t_array,
                "ts": ts_array,
                "bs_count": bs_count,
                "be_count": be_count,
                "bs_time": bs_time,
                "frame_dur": round(t_array[-1] + t_delta, 2),
                "dt": t_delta,
            }

    is_supported = (
        isinstance(descriptor, StringIO)
        or "cStringIO" in str(descriptor.__class__)
        or isinstance(descriptor, (io.TextIOWrapper, io.BufferedReader))
    )
    if not is_supported:
        raise ValueError("Provide a file descriptor as input! Make sure you are using a Python3 compatible descriptor such as io.open.")

    spec_rel_bns = sorted(spec_rel_bns)
    spec_vent_bns = sorted(spec_vent_bns)
    collection_start = False
    last_t = 0  # first data point starts at 0
    bs_count = 0
    be_count = 0
    bs_time = 0.02
    t_delta = 0.02
    rel_ts = 0
    vent_bn = 0
    rel_bn = 0
    has_bs = False
    idx = 0
    flow, pressure, t_array, timestamp_array = reset_arrays(None, None, None, None)

    try:
        descriptor = clear_descriptor_null_bytes(descriptor)
    except UnicodeDecodeError:
        raise BadDescriptorError('You seem to have opened a file with garbled bytes. you should open it using io.open(file, encoding="ascii", errors="ignore"')

    reader = csv.reader(descriptor)
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    vent_bn_regex = re.compile(r"S:(\d+)")

    descriptor.seek(0)
    first_line = descriptor.readline()
    bs_col, ncol, ts_1st_col, ts_1st_row = detect_version_v2(first_line)
    if ts_1st_row:
        abs_time = datetime.strptime(first_line.strip('\r\n'), IN_DATETIME_FORMAT)
    descriptor.seek(0)

    for row in reader:
        # Rows too short to hold the marker column (e.g. blank lines) are
        # ignored.
        try:
            row[bs_col]
        except IndexError:
            continue

        if row[bs_col].strip() == "BS":
            collection_start = True
            # The previous breath never saw its BE; emit it now unless the
            # caller asked for such breaths to be dropped.
            if not ignore_missing_bes and has_bs:
                data = get_data(
                    flow, pressure, t_array, timestamp_array, rel_bn, vent_bn, bs_count, be_count, bs_time, t_delta
                )
                if data:
                    yield data
            # NOTE(review): nesting reconstructed from whitespace-collapsed
            # source; bs_time/rel_ts are assumed to reset on every BS.
            bs_time = round(last_t + 0.02, 2)
            rel_ts = 0
            bs_count += 1
            rel_bn += 1
            idx = 0
            has_bs = True
            flow, pressure, t_array, timestamp_array = reset_arrays(
                flow, pressure, t_array, timestamp_array
            )
            try:
                match = vent_bn_regex.search(row[bs_col + 1])
            except IndexError:
                has_bs = False
                continue
            if not match:
                has_bs = False  # Don't collect data for the breath
                continue
            vent_bn = int(match.groups()[0])

            # Past the end of the requested range: stop the generator.
            # Otherwise, breaths outside the selection are simply not
            # collected.
            if rel_bn_interval and rel_bn > rel_bn_interval[1]:
                return
            elif vent_bn_interval and vent_bn > vent_bn_interval[1]:
                return
            elif spec_rel_bns and rel_bn > spec_rel_bns[-1]:
                return
            elif spec_vent_bns and vent_bn > spec_vent_bns[-1]:
                return
            elif vent_bn_interval and not (vent_bn_interval[0] <= vent_bn <= vent_bn_interval[1]):
                has_bs = False
            elif rel_bn_interval and not (rel_bn_interval[0] <= rel_bn <= rel_bn_interval[1]):
                has_bs = False
            elif spec_rel_bns and (rel_bn not in spec_rel_bns):
                has_bs = False
            elif spec_vent_bns and (vent_bn not in spec_vent_bns):
                has_bs = False

        elif row[bs_col].strip() == "BE":
            be_count += 1
            has_bs = False
            data = get_data(
                flow, pressure, t_array, timestamp_array, rel_bn, vent_bn, bs_count, be_count, bs_time, t_delta
            )
            if data:
                yield data
            bs_time = round(last_t + 0.02, 2)
            rel_ts = 0

        else:
            if collection_start:  # if there is stray data at the top of the file
                # make sure data is correctly formed
                try:
                    float(row[ncol - 2])
                    float(row[ncol - 1])
                except (IndexError, ValueError):
                    continue
                # The file clock advances for every well-formed sample, even
                # those belonging to breaths that are not being collected.
                last_t = round(last_t + .02, 2)

            if not has_bs:
                continue

            try:
                flow[idx] = round(float(row[ncol - 2]), 2)
                pressure[idx] = round(float(row[ncol - 1]), 2)
            except (IndexError, ValueError):
                continue

            t_array[idx] = round(rel_ts, 2)
            if ts_1st_col:
                timestamp_array[idx] = row[0]
            elif ts_1st_row:
                timestamp_array[idx] = (abs_time + timedelta(seconds=last_t)).strftime(OUT_DATETIME_FORMAT)
            rel_ts = round(rel_ts + t_delta, 2)
            idx += 1
def bs_be_denoting_extractor(descriptor, rel_bn_interval=[]):
    """
    Takes a file descriptor without BS/BE markers and then adds BS and BE
    markers to it, and then returns the breath data generator from extract_raw

    The numeric rows are loaded with np.genfromtxt. Column 0 is used as flow
    and column 1 as pressure (going by the variable names below). A breath
    start is declared when flow is at least flow_min_threshold and has risen
    by at least flow_diff_threshold over the last n_last_flow_obs samples.
    Each pair of consecutive breath starts is written out as one
    BS ... BE delimited section, and the resulting text is handed to
    extract_raw.

    NOTE(review): the nesting of this function was reconstructed from a
    source whose whitespace had been collapsed -- confirm against upstream.

    :param descriptor: A file descriptor for a ventilator data file without
                       BS or BE markers.
    :param rel_bn_interval: forwarded unchanged to extract_raw
    """
    last_bs_loc = None
    cur_bs_loc = None
    first_line = descriptor.readline()
    bs_col, ncol, ts_1st_col, ts_1st_row = detect_version_v2(first_line)
    # Keep a leading timestamp row as the first line of the rebuilt file;
    # otherwise rewind so the first row is parsed as data.
    if ts_1st_row:
        data = first_line
    else:
        descriptor.seek(0)
        data = ""
    breath_idx = 1
    flow_min_threshold = 10
    flow_diff_threshold = 5
    n_last_flow_obs = 4
    n_last_pressure_obs = 5
    n_lookback = 4
    n_lookback_fallback = 2
    # Starting guesses, replaced by running medians once breaths are found
    median_peep = 0
    median_pip = 100
    observations = np.genfromtxt(descriptor, delimiter=',')
    # Latch: True while we are waiting for the next breath start
    thresh_not_met = True
    peep_buffer = []
    pip_buffer = []
    pressure_buffer_len = 25
    pressure_diff_frac = 0.7

    # The current index we are at in observations variable will always be
    # i+n_last_flow_obs
    for i, obs in enumerate(observations[n_last_flow_obs:]):
        true_idx = i + n_last_flow_obs
        # We are always <n_last_flow_obs> ahead of i in the observations array
        flow_diff = obs[0] - observations[i,0]
        pressure_diff_thresh = (median_pip - median_peep) * pressure_diff_frac
        # A pressure this high clears the latch without marking a breath start
        if obs[1] >= (median_peep + pressure_diff_thresh):
            thresh_not_met = False

        if thresh_not_met and obs[0] >= flow_min_threshold and flow_diff >= flow_diff_threshold:
            thresh_not_met = False
            # Walk backwards looking for the most recent negative-flow sample;
            # the breath start is placed on the sample right after it.
            for offset in range(n_lookback):
                if (
                    true_idx - (offset + 1) < 0
                    or observations[true_idx - (offset + 1), 0] < 0
                ):
                    last_bs_loc = cur_bs_loc
                    # Would including the first negative point be best? Let's try
                    #
                    # Results indicate it's more of a problem than anything, but it
                    # might be worth reinvestigation
                    cur_bs_loc = true_idx - offset
                    break
            else:
                # No negative flow within the lookback window: fall back to a
                # fixed offset behind the current sample.
                last_bs_loc = cur_bs_loc
                cur_bs_loc = true_idx - n_lookback_fallback

            # XXX Current methodology just constructs a basic file descriptor
            # to pass to extract_raw. This is not very efficient, but was
            # easiest to code for evaluation of algorithms. Future engineering
            # can just modify this function to return our canonical data dict
            if last_bs_loc:
                data += (
                    "BS, S:{}\n".format(breath_idx) +
                    fmt_as_csv(observations[last_bs_loc:cur_bs_loc]) +
                    "\nBE\n"
                )
                breath_idx += 1

            # Once at least one breath has been written, refresh the running
            # PEEP/PIP medians over the last pressure_buffer_len breaths.
            if breath_idx != 1:
                peep_idx = cur_bs_loc - n_last_pressure_obs if cur_bs_loc - n_last_pressure_obs > 0 else 0
                peep = np.mean(observations[peep_idx:true_idx,1])
                pip = np.max(observations[last_bs_loc:cur_bs_loc,1])
                if len(peep_buffer) < pressure_buffer_len:
                    peep_buffer.append(peep)
                    pip_buffer.append(pip)
                else:
                    peep_buffer.pop(0)
                    peep_buffer.append(peep)
                    pip_buffer.pop(0)
                    pip_buffer.append(pip)
                median_peep = np.median(peep_buffer)
                median_pip = np.median(pip_buffer)
            # when debugging the i index cannot be trusted as a gauge of time
            # in relation to the file with BS and BE.
            if breath_idx:
                #import IPython; IPython.embed()
                pass
        elif not thresh_not_met and obs[0] < flow_min_threshold and obs[1] < (median_peep + pressure_diff_thresh):
            # Flow and pressure are both low again: re-arm the latch
            thresh_not_met = True
    else:
        # End of data: everything from the last detected start onwards
        # becomes the final breath.
        data += (
            "BS, S:{}\n".format(breath_idx) +
            fmt_as_csv(observations[cur_bs_loc:]) +
            "\nBE\n"
        )
    return extract_raw(StringIO(data), False, rel_bn_interval=rel_bn_interval)
def real_time_extractor(descriptor,
                        ignore_missing_bes,
                        rel_bn_interval=[],
                        vent_bn_interval=[],
                        spec_rel_bns=[],
                        spec_vent_bns=[]):
    """
    The exact same functionality as extract_raw, except this method returns
    a list of breaths and is also able to update timestamp based on whether/not
    a new timestamp is found in file. Both of these functions are necessary for
    real time TOR.

    In future, we might be able to consolidate this function with extract_raw,
    but for now this works fine and there is no need to expend the engineering
    effort

    At most one of the four filtering options may be given.

    :param descriptor: The file descriptor to use
    :param ignore_missing_bes: boolean whether or not to ignore missing BEs in the data (False if we want to use breaths without a BE, True otherwise)
    :param rel_bn_interval: The relative [start, end] interval for the data
    :param vent_bn_interval: The vent bn [start, end] interval for the data
    :param spec_rel_bns: The specific relative bns that we want eg: [1, 10, 20]
    :param spec_vent_bns: The specific vent bns that we want eg: [1, 10, 20]

    :returns: list of per-breath dicts with keys rel_bn, vent_bn, flow,
              pressure, t, ts, bs_count, be_count, bs_time, frame_dur, dt
    :raises ValueError: if descriptor is not an accepted stream type, or if
                        more than one filtering option is supplied
    :raises Exception: if the first row of the file is not a timestamp
    """
    def get_data(flow, pressure, t_array, ts_array, rel_bn, vent_bn, bs_count, be_count, bs_time, t_delta):
        # Build the per-breath dict, or return None when no samples remain
        # after filtering.
        flow, pressure, t_array, ts_array = filter_arrays(
            flow, pressure, t_array, ts_array
        )
        if flow:
            return {
                "rel_bn": rel_bn,
                "vent_bn": vent_bn,
                "flow": flow,
                "pressure": pressure,
                "t": t_array,
                "ts": ts_array,
                "bs_count": bs_count,
                "be_count": be_count,
                "bs_time": bs_time,
                "frame_dur": t_array[-1] + t_delta,
                "dt": t_delta,
            }

    is_supported = (
        isinstance(descriptor, StringIO)
        or "cStringIO" in str(descriptor.__class__)
        or isinstance(descriptor, io.TextIOWrapper)
    )
    if not is_supported:
        raise ValueError("Provide a file descriptor as input! Make sure you are using a Python3 compatible descriptor such as io.open.")

    # Count the options directly. The previous nested-xor test let any
    # combination of three options slip through.
    n_filters = sum(
        1
        for option in (rel_bn_interval, vent_bn_interval, spec_rel_bns, spec_vent_bns)
        if len(option) > 0
    )
    if n_filters > 1:
        raise ValueError("You can only specify one vent or rel bn filtering option for use!")

    spec_rel_bns = sorted(spec_rel_bns)
    spec_vent_bns = sorted(spec_vent_bns)
    collection_start = False
    last_t = 0  # first data point starts at 0
    bs_count = 0
    be_count = 0
    bs_time = 0.02
    t_delta = 0.02
    rel_ts = 0
    vent_bn = 0
    rel_bn = 0
    has_bs = False
    idx = 0
    flow, pressure, t_array, timestamp_array = reset_arrays(None, None, None, None)
    descriptor = clear_descriptor_null_bytes(descriptor)
    reader = csv.reader(descriptor)
    data_list = []
    # Raw strings: "\d" in a plain literal is an invalid escape sequence.
    vent_bn_regex = re.compile(r"S:(\d+)")
    date_search = re.compile(r"^20[12]\d-[01]\d-")

    descriptor.seek(0)
    first_line = descriptor.readline()
    # Should we be more strict and now allow breaths without a TS up top?
    bs_col, ncol, ts_1st_col, ts_1st_row = detect_version_v2(first_line)
    if ts_1st_row:
        abs_time = datetime.strptime(first_line.strip('\r\n'), IN_DATETIME_FORMAT)
        start_time = abs_time
    else:
        raise Exception("A breath timestamp must be on first row!")
    descriptor.seek(0)

    for row in reader:
        # Rows too short to hold the marker column (e.g. blank lines) are
        # ignored.
        try:
            row[bs_col]
        except IndexError:
            continue

        # XXX fix bs_time! it is not accurate when we update the timestamp
        #
        # update abs time
        if date_search.search(row[0]):
            abs_time = datetime.strptime(row[0], IN_DATETIME_FORMAT)
            last_t = 0
            bs_time = round((abs_time + timedelta(seconds=0.02) - start_time).total_seconds(), 2)
            continue

        if row[bs_col].strip() == "BS":
            collection_start = True
            # The previous breath never saw its BE; keep it unless the caller
            # asked for such breaths to be dropped.
            if not ignore_missing_bes and has_bs:
                data = get_data(
                    flow, pressure, t_array, timestamp_array, rel_bn, vent_bn, bs_count, be_count, bs_time, t_delta
                )
                if data:
                    data_list.append(data)
            # NOTE(review): nesting reconstructed from whitespace-collapsed
            # source; bs_time/rel_ts are assumed to reset on every BS.
            bs_time = round((abs_time + timedelta(seconds=last_t) - start_time).total_seconds(), 2)
            rel_ts = 0
            bs_count += 1
            rel_bn += 1
            idx = 0
            has_bs = True
            flow, pressure, t_array, timestamp_array = reset_arrays(
                flow, pressure, t_array, timestamp_array
            )
            try:
                match = vent_bn_regex.search(row[bs_col + 1])
            except IndexError:
                has_bs = False
                continue
            if not match:
                has_bs = False  # Don't collect data for the breath
                continue
            vent_bn = int(match.groups()[0])

            # Past the end of the requested range: stop reading. Otherwise,
            # breaths outside the selection are simply not collected.
            if rel_bn_interval and rel_bn > rel_bn_interval[1]:
                break
            elif vent_bn_interval and vent_bn > vent_bn_interval[1]:
                break
            elif spec_rel_bns and rel_bn > spec_rel_bns[-1]:
                break
            elif spec_vent_bns and vent_bn > spec_vent_bns[-1]:
                break
            elif vent_bn_interval and not (vent_bn_interval[0] <= vent_bn <= vent_bn_interval[1]):
                has_bs = False
            elif rel_bn_interval and not (rel_bn_interval[0] <= rel_bn <= rel_bn_interval[1]):
                has_bs = False
            elif spec_rel_bns and (rel_bn not in spec_rel_bns):
                has_bs = False
            elif spec_vent_bns and (vent_bn not in spec_vent_bns):
                has_bs = False

        elif row[bs_col].strip() == "BE":
            be_count += 1
            has_bs = False
            data = get_data(
                flow, pressure, t_array, timestamp_array, rel_bn, vent_bn, bs_count, be_count, bs_time, t_delta
            )
            if data:
                data_list.append(data)
            bs_time = round((abs_time + timedelta(seconds=last_t) - start_time).total_seconds(), 2)
            rel_ts = 0

        else:
            if collection_start:  # if there is stray data at the top of the file
                # make sure data is correctly formed
                try:
                    float(row[ncol - 2])
                    float(row[ncol - 1])
                except (IndexError, ValueError):
                    continue
                # The file clock advances for every well-formed sample, even
                # those belonging to breaths that are not being collected.
                last_t = round(last_t + .02, 2)

            if not has_bs:
                continue

            try:
                flow[idx] = round(float(row[ncol - 2]), 2)
                pressure[idx] = round(float(row[ncol - 1]), 2)
            except (IndexError, ValueError):
                continue

            t_array[idx] = round(rel_ts, 2)
            if ts_1st_col:
                timestamp_array[idx] = row[0]
            elif ts_1st_row:
                timestamp_array[idx] = (abs_time + timedelta(seconds=last_t)).strftime(OUT_DATETIME_FORMAT)
            rel_ts = round(rel_ts + t_delta, 2)
            idx += 1

    return data_list