def write_dataset_to_file(filename, type, length):
    """
    Write a preprocessed dataset with a given segment length to a given file.

    The intended file format is one row of space-separated values for each
    point.  The implementation is still a stub: it walks the record and
    segment headers and prints each segment name, but does not write
    anything to ``filename`` yet.

    :param filename: the file's name.
    :param type: the signal type (e.g., "ABP").
    :param length: the number of samples each segment is made of.
    :return: nothing
    """
    ECG_lists = collections.defaultdict(list)  # placeholder, not filled yet
    for pair in get_waveforms_list():
        folder, record = pair[0], pair[1]
        remote_dir = 'mimic3wdb/{}/{}/'.format(folder, record)
        record_header = wfdb.rdheader(record, pbdir=remote_dir)
        for segment in record_header.segname:
            if not is_segment(segment):
                continue
            # Read segment's header file.
            print(segment)
            # TODO: change to read content.
            segment_content = wfdb.rdheader(segment, pbdir=remote_dir)
            # TODO: preprocess and slice on the go, we have no space for the
            # whole thing.
            if type == "ABP":
                pass
def extract_labs(infile):
    """
    Return all lab values stored in the MIMIC-III database for ``infile``.

    TODO: Will need to build option to include only labs/notes in the period
    where there is waveform/numeric data but for now we include everything
    so it is available for context (eg echo reports)

    Parameters
    ----------
    infile: string
        filename of a wfdb file from the MIMIC3 matched dataset

    Returns
    -------
    labs: DataFrame
        one row per lab event.  The first column, ``time``, is the whole
        number of seconds between the record's base datetime and the lab's
        ``charttime`` (negative for labs charted before the record starts);
        the ``charttime`` column itself is dropped.

    Raises
    ------
    ValueError
        if a lab event has no ``charttime`` (the offset cannot be cast to int).
    """
    # get patient ID
    subj_id = patient_id_from_file(infile)

    # get basetime -- the header is read once and reused below
    origin = pd.to_datetime(wfdb.rdheader(infile).base_datetime)

    # get lab_events for this patient
    con = open_db()
    # NOTE(review): subj_id is interpolated into the SQL text.  Switch to
    # `params=` once the DB driver's placeholder style is confirmed.
    query = \
    """
    SELECT e.charttime, e.itemid, e.value, e.valuenum, e.valueuom, e.flag,
        i.label, i.fluid, i.category, i.loinc_code
    FROM labevents e
    INNER JOIN d_labitems i
    ON e.itemid = i.itemid
    WHERE subject_id = {};
    """.format(subj_id)
    labs = pd.read_sql_query(query, con)

    # convert time: vectorised instead of a per-row chained assignment
    # (`labs['time'].iloc[idx] = ...`), which does not write through to the
    # frame when pandas copy-on-write is active.
    elapsed = pd.to_datetime(labs['charttime']) - origin
    labs.insert(0, 'time', elapsed.dt.total_seconds().astype(int))
    del labs['charttime']
    return labs
def extract_labels(self, filepath):
    """
    Collect person labels (and, for PTB, health labels) from ``filepath``.

    Every directory entry whose name starts with ``Person_`` or ``patient``
    is appended to ``self.persons_labels``.  When ``self.args.database`` is
    "PTB", the first ``*hea`` file inside that entry is read with wfdb and
    the text after the first ":" of its fifth comment line is appended to
    ``self.health_labels``.  Entries ending in ``csv`` (MIT-BIH layout) are
    appended to ``self.persons_labels`` without their extension.

    :param filepath: directory to scan.
    :return: nothing; results are appended to attributes of ``self``.
    """
    p_index = -1  # running count of accepted entries; not used afterwards
    person_prefixes = ('Person_', 'patient')
    for entry in os.listdir(filepath):
        if entry.startswith(person_prefixes):
            p_index += 1
            self.persons_labels.append(entry)
            if self.args.database == "PTB":
                person_dir = os.path.join(filepath, entry)
                for fname in os.listdir(person_dir):
                    if not fname.endswith('hea'):
                        continue
                    record_path = os.path.join(filepath, entry,
                                               fname.split(".", 1)[0])
                    ecg_record = wfdb.rdheader(record_path)
                    status_line = ecg_record.comments[4]
                    self.health_labels.append(
                        status_line.split(":")[1].strip())
                    # One header per person is enough to know the health
                    # status, so stop at the first one.
                    break
            # (Parsing of age / gender / date labels from rec_1.hea is
            # currently disabled.)
        if entry.endswith('csv'):
            # MIT-BIH database: flat files rec_1, rec_2, ... that are not
            # grouped into per-person directories.
            p_index += 1
            self.persons_labels.append(entry.split(".", 1)[0])
def writePatientData():
    """
    Write one CSV line per PTB record to ``../out/patientAttr.txt``.

    Record names are read from ``../ptbdb/RECORDS``.  Each output line is:
    record name, record path, every attribute returned by
    ``rutil.extractPatientAttributes`` and the first entry returned by
    ``rutil.extratPatientDiagnoses``.

    Both files are opened in a ``with`` block so they are closed (and the
    output flushed) even if reading a header fails part-way through.
    """
    with open('../ptbdb/RECORDS', 'r') as records, \
            open('../out/patientAttr.txt', 'w') as out:
        for line in records:
            pat = line.strip()
            rfile = '../ptbdb/' + pat
            record = wfdb.rdheader(rfile)
            diagnoses = rutil.extratPatientDiagnoses(record)
            attributes = rutil.extractPatientAttributes(record)
            fields = [pat, rfile, *attributes, diagnoses[0]]
            out.write(','.join(fields) + '\n')
    # record = wfdb.rdsamp('../ptbdb/patient002/s0015lre')
    # rutil.showGraph(record)
#writeAllDiagnosisToFile()
#rman.separateRecords()
def _load_header(self, rec:str) -> dict:
    """
    load header data into a dict

    Parameters:
    -----------
    rec: str,
        name of the record

    Returns:
    --------
    header_dict: dict,
        with keys 'units', 'baseline', 'adc_gain', 'record_fmt', 'age'
        (``np.nan`` if missing or not an integer), 'sex' ('' if missing) and
        'diagnoses' (every comment line after the '<diagnoses>' line)

    Raises:
    -------
    IndexError,
        if the header has no '<diagnoses>' comment line
    """
    rec_fp = os.path.join(self.db_dir, rec)
    header_reader = wfdb.rdheader(rec_fp)
    comments = header_reader.comments

    header_dict = ED({})
    header_dict['units'] = header_reader.units
    header_dict['baseline'] = header_reader.baseline
    header_dict['adc_gain'] = header_reader.adc_gain
    header_dict['record_fmt'] = header_reader.fmt

    # Only "line missing" (IndexError) and "value not an int" (ValueError)
    # are expected here; a bare `except:` would also hide KeyboardInterrupt
    # and genuine programming errors.
    try:
        age_line = [l for l in comments if '<age>' in l][0]
        header_dict['age'] = int(age_line.split(': ')[-1])
    except (IndexError, ValueError):
        header_dict['age'] = np.nan
    try:
        header_dict['sex'] = [l for l in comments if '<sex>' in l][0].split(': ')[-1]
    except IndexError:
        header_dict['sex'] = ''

    d_start = [idx for idx, l in enumerate(comments) if '<diagnoses>' in l][0] + 1
    header_dict['diagnoses'] = comments[d_start:]
    return header_dict
def read_records(dataset_name, data_path, sample_size_seconds=30, samples_per_second=250, num_records=None):
    """
    Read fixed-length samples from every record of a dataset.

    Records whose header reports ``sig_len == 0`` are skipped.  Each
    remaining record is consumed through repeated ``read_record`` calls
    until it returns ``None`` as the sample.

    :param dataset_name: name passed to ``wfdb.get_record_list``.
    :param data_path: prefix prepended to each record name.
    :param sample_size_seconds: forwarded to ``read_record``.
    :param samples_per_second: forwarded to ``read_record``.
    :param num_records: stop after this many non-empty records
        (``None`` reads all of them).
    :return: tuple ``(samples, labels)``; ``labels`` is a numpy array with 1
        where '(AFIB' is contained in the sample's ``aux_note``, else 0.
    """
    samples, notes = [], []
    records_done = 0
    for record_name in wfdb.get_record_list(dataset_name):
        header = wfdb.rdheader(data_path + record_name)
        if header.sig_len == 0:
            continue

        offset = 0
        record, ann, offset = read_record(data_path, header, offset,
                                          sample_size_seconds,
                                          samples_per_second)
        while record is not None:
            samples.append(record)
            notes.append(ann.aux_note)
            record, ann, offset = read_record(data_path, header, offset,
                                              sample_size_seconds,
                                              samples_per_second)

        records_done += 1
        if num_records is not None and records_done == num_records:
            break

    labels = np.array([int('(AFIB' in note) for note in notes])
    return samples, labels
def _load_header(self, rec:str) -> dict:
    """
    load header data into a dict

    Parameters
    ----------
    rec: str,
        name of the record

    Returns
    -------
    header_dict: dict,
        with keys "units", "baseline", "adc_gain", "record_fmt", "age"
        (``np.nan`` if missing or not an integer), "sex" ("" if missing) and
        "diagnoses" (every comment line after the "<diagnoses>" line)

    Raises
    ------
    IndexError,
        if the header has no "<diagnoses>" comment line
    """
    rec_fp = os.path.join(self.db_dir, rec)
    header_reader = wfdb.rdheader(rec_fp)
    comments = header_reader.comments

    header_dict = ED({})
    header_dict["units"] = header_reader.units
    header_dict["baseline"] = header_reader.baseline
    header_dict["adc_gain"] = header_reader.adc_gain
    header_dict["record_fmt"] = header_reader.fmt

    # Only "line missing" (IndexError) and "value not an int" (ValueError)
    # are expected here; a bare `except:` would also hide KeyboardInterrupt
    # and genuine programming errors.
    try:
        age_line = [l for l in comments if "<age>" in l][0]
        header_dict["age"] = int(age_line.split(": ")[-1])
    except (IndexError, ValueError):
        header_dict["age"] = np.nan
    try:
        header_dict["sex"] = [l for l in comments if "<sex>" in l][0].split(": ")[-1]
    except IndexError:
        header_dict["sex"] = ""

    d_start = [idx for idx, l in enumerate(comments) if "<diagnoses>" in l][0] + 1
    header_dict["diagnoses"] = comments[d_start:]
    return header_dict
def find_admission(filename):
    '''
    Get admission information from a MIMIC III filename (contains subj ID).

    The record's base datetime is compared with every admission of the
    subject; the admission whose admit/discharge interval strictly contains
    it supplies the returned values (the last such admission wins if there
    are several).

    :param filename: wfdb record name of a MIMIC III matched file.
    :return: tuple ``(subj_id, hadm_id, age, gender, ethnicity, diagnosis,
        expired, death_time)``.  When no admission brackets the signal
        start, ``hadm_id`` is 0, ``age`` is ``None`` and the remaining
        admission fields are empty strings.
    '''
    # Defaults used when no admission matches.  `age` previously had no
    # default, so the return statement raised UnboundLocalError.
    hadm_id = 0
    age = None
    diagnosis = ''
    expired = ''
    death_time = ''
    ethnicity = ''

    subj_id = patient_id_from_file(filename)
    print('searching {}'.format(subj_id))

    # get additional demographics
    con = open_db()
    query = \
    """
    SELECT i.subject_id, i.gender, i.dob
    FROM patients i
    WHERE subject_id = {};
    """.format(subj_id)
    demo = pd.read_sql_query(query, con)
    gender = demo.gender.values[0]
    dob = demo.dob.values[0]

    record = wfdb.rdheader(filename)
    sig_start = record.base_datetime
    print('Signal file start {}'.format(sig_start))

    admits = get_admissions(subj_id)
    for _, row in admits.iterrows():
        adm_time = row['admittime']
        dsc_time = row['dischtime']
        print('Admission # {}, in at {} out at {}'.format(
            row['hadm_id'], adm_time, dsc_time))
        if (sig_start > adm_time) and (sig_start < dsc_time):
            print('Subject {}, record {}, diagnosis: {}. HADM {} '.format(
                row['subject_id'], record.record_name, row['diagnosis'],
                row['hadm_id']))
            hadm_id = row['hadm_id']
            diagnosis = row['diagnosis']
            expired = row['hospital_expire_flag']
            death_time = row['deathtime']
            ethnicity = row['ethnicity']
            # age at admission, in 365-day years
            age = round(
                pd.to_timedelta(adm_time - dob) /
                pd.to_timedelta(365, unit='d'))

    return (subj_id, hadm_id, age, gender, ethnicity, diagnosis, expired,
            death_time)
def rdann_by_type(
    rec_path: str,
    ann_ext: str,
    from_time: str = None,
    to_time: str = None,
    types: str = WFDB_ANN_ALL_PEAK_TYPES,
):
    """
    Reads WFDB annotation file and returns annotations of specific types.

    :param rec_path: Record path (without extension).
    :param ann_ext: extension of annotation file to load.
    :param from_time: Start time. A string in PhysioNet time format [1]_.
    :param to_time: End time. A string in PhysioNet time format [1]_.
    :param types: A string of chars of the annotation types to find (see [2]_).
    :return: A dictionary, mapping from the annotation type (a char) to a
        numpy array of indices in the signal.
    :raises ValueError: if the record (with that annotation extension)
        cannot be found.

    .. [1] https://www.physionet.org/physiotools/wag/intro.htm#time
    .. [2] https://www.physionet.org/physiobank/annotations.shtml
    """
    if not is_record(rec_path, ann_ext=ann_ext):
        raise ValueError(f"Can't find record {rec_path}")

    # In case it's a Path object; wfdb can't handle that
    rec_path = str(rec_path)

    # Handle from/to by converting to samples
    sampfrom, sampto = 0, sys.maxsize
    if from_time is not None or to_time is not None:
        header = wfdb.rdheader(rec_path)
        if from_time is not None:
            sampfrom = wfdb_time_to_samples(from_time, header.fs)
        if to_time is not None:
            sampto = wfdb_time_to_samples(to_time, header.fs)

    # Read annotations
    ann = wfdb.rdann(rec_path, ann_ext, sampfrom, sampto)

    # Match each symbol against the requested types directly.  The earlier
    # approach built the regex "[<types>]" without escaping (so symbols such
    # as "]", "^" or "-" changed its meaning) and used offsets into the
    # joined symbol string as annotation indices, which is only valid when
    # every symbol is exactly one character long.
    ann_to_idx = {ann_type: [] for ann_type in types}
    for symbol, sample in zip(ann.symbol, ann.sample):
        if symbol in ann_to_idx:
            ann_to_idx[symbol].append(sample)

    return {
        ann_type: np.array(idxs)
        for (ann_type, idxs) in ann_to_idx.items()
    }
def header_reader(potassium_df=potassium_df): from datetime import datetime as dt # for i in tqdm(range(len(potassium_df))): for i in range(100): charttime = dt.strptime(potassium_df.iloc[i]['CHARTTIME'], '%Y-%m-%d %H:%M:%S') target_file = pd.DataFrame(index=[], columns=['file', 'diff']) for j in range(58): file = potassium_df.iloc[i][str(j)] if file is np.nan: continue filename = os.path.join(potassium_df.iloc[i]['dir'], file) filename = filename[:-4] record = wfdb.rdheader(filename) sig_len = record.sig_len date_time = record.base_datetime fs = record.fs #K測定時刻と心電図開始時間の照合 diff_time = date_time - charttime pot_len = diff_time.seconds * fs #print(i,j) #print(charttime) #print(pot_len) if (pot_len > sig_len): #headerのどこにも合わない場合にはcontinueで時間節約 continue #全体のheader fileから各ファイルの名前とseq長の表を作成 header_table_df = pd.DataFrame(np.vstack( [record.seg_name, record.seg_len]).T, columns=['file', 'sequence']) #header_table_df=header_table_df.query('file.str.match("[0-9]{7}_[0-9]{4}")') header_table_df['cumsum'] = header_table_df['sequence'].astype( int).cumsum() #header_table_df.query('cumsum-pot_len>0').iloc[0] #K測定時刻に相当するファイルを取り出す header_table_df['diff'] = header_table_df['cumsum'] - pot_len # target fileにはpotassium測定時刻に対応するfile名file、当該時刻における初めからの系列の長さを記録 if len(target_file.index) == 1: print('{},{}: target file already exists!!'.format(i, j)) target_file = header_table_df.query('diff>0') if len(target_file.index) == 0: continue elif len(target_file.index) == 1: #print('---') target_file = target_file.iloc[0][['file', 'diff']] #print(i,j) #print(target_file) #print('test') potassium_df['file'][i] = target_file['file'] potassium_df['diff'][i] = target_file['diff'] return (potassium_df)
def ecgrr(
    rec_path,
    ann_ext=None,
    channel=None,
    from_time=None,
    to_time=None,
    detector=qrs.ecgpuwave_detect_rec,
    dtype=np.float32,
):
    """
    Returns an RR-interval time-series given a PhysioNet record.

    :param rec_path: The path to the record (without any file extension).
    :param ann_ext: Extension of annotation file to use. If provided,
        R-peaks will be read from this annotation file instead of
        performing peak-detection.
    :param channel: Number of ECG channel in the record. Will be
        heuristically estimated if missing.
    :param from_time: Start time. A string in the PhysioNet time format.
    :param to_time: End time. A string in the PhysioNet time format.
    :param detector: A function to use for peak-detection. Will only be
        used if the ann_ext parameter was not provided.
    :param dtype: Desired dtype of output tensors.
    :return: Tuple of time axis and interval durations. Both are empty
        arrays when fewer than two R-peaks are available, since no
        interval can be formed.
    :raises ValueError: if the record cannot be found.
    """
    if not utils.is_record(rec_path, ann_ext=ann_ext):
        raise ValueError(f"Can't find record {rec_path}")

    if ann_ext is not None:
        # Load r-peaks from annotation
        ann_type = "N"
        ann = utils.rdann_by_type(rec_path, ann_ext, from_time, to_time,
                                  types=ann_type)
        sample_idxs = ann[ann_type]
    else:
        # Calculate r-peaks using a peak-detector
        sample_idxs = detector(rec_path, channel=channel,
                               from_time=from_time, to_time=to_time)

    # Zero or one peak used to fail with IndexError further down.
    if len(sample_idxs) < 2:
        return np.empty(0, dtype=dtype), np.empty(0, dtype=dtype)

    # str(): rec_path may be a Path object, which wfdb cannot handle.
    header = wfdb.rdheader(str(rec_path))
    fs = float(header.fs)

    start_time = sample_idxs[0] / fs
    rri = np.diff(sample_idxs) / fs
    # Time of each interval: the first one starts at the first peak, the
    # following ones at the running sum of the previous intervals.
    trr = np.concatenate(([0.0], np.cumsum(rri[:-1]))) + start_time

    return trr.astype(dtype), rri.astype(dtype)
def get_wf_header(stay_id: str, pn_dir: str, max_retries=None):
    """
    Read a wfdb header from PhysioNet, retrying on connection errors.

    After every failed attempt the function sleeps, doubling the delay each
    time (1 s, 2 s, 4 s, ...).

    :param stay_id: record name passed to ``wfdb.rdheader``.
    :param pn_dir: PhysioNet directory passed to ``wfdb.rdheader``.
    :param max_retries: maximum number of retries after the first failed
        attempt.  ``None`` (the default) retries forever, which is the
        previous behaviour.
    :return: the header object returned by ``wfdb.rdheader``.
    :raises ConnectionError, MaxRetryError,
        requests.exceptions.ConnectionError: the last connection error, once
        ``max_retries`` retries have been used up.
    """
    wf = None
    sleep_period = 1
    failures = 0
    # `is None` instead of truthiness: a header object that happens to
    # evaluate as false must not trigger another download.
    while wf is None:
        try:
            wf = wfdb.rdheader(stay_id, pn_dir=pn_dir)
        except (ConnectionError, MaxRetryError,
                requests.exceptions.ConnectionError):
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            time.sleep(sleep_period)
            sleep_period *= 2
    return wf
def pickle_ECG_headers():
    """
    Scan the dataset header files, build a list of Segments (length, record
    and signal types) and pickle it to "mimicIII-headers-list.pickle".

    Header fields used:
        siglen - integer signal length in terms of samples
        signame - list of signal types; only segments containing at least
            one of the accepted types below are kept.
        segname - list of segment names, contained in a record's header.

    :return: nothing.
    """
    accepted_types = {"I", "II", "III", "AVR", "AVL", "AVF", "V", "V1", "V2",
                      "V3", "V4", "V5", "MLI", "MLII", "MLIII", "ABP"}
    found = []
    for pair in get_waveforms_list():
        folder, record = pair[0], pair[1]
        remote_dir = 'mimic3wdb/{}/{}/'.format(folder, record)
        record_header = wfdb.rdheader(record, pbdir=remote_dir)
        for segment in record_header.segname:
            if not is_segment(segment):
                continue
            # Read segment's header file.
            print(segment)
            segment_header = wfdb.rdheader(segment, pbdir=remote_dir)
            # Keep the segment only if it carries an accepted signal type.
            matching_types = set(segment_header.signame).intersection(
                accepted_types)
            if matching_types:
                found.append(
                    Segment(segment_header.siglen, int(record),
                            matching_types))

    with open('mimicIII-headers-list.pickle', 'wb') as file:
        pickle.dump(found, file)
def find_ecg_channel(rec_path):
    """
    Heuristically finds the index of the first ECG channel in a record.

    A channel counts as ECG when its name matches ``ECG_CHANNEL_PATTERN``
    (case-insensitive, anchored at the start of the name).

    :param rec_path: Path to record without extension.
    :return: The index of the first ECG channel, or None if no channel
        name matches.
    """
    matcher = re.compile(ECG_CHANNEL_PATTERN, re.IGNORECASE)
    channel_names = wfdb.rdheader(rec_path).sig_name
    return next(
        (idx for idx, name in enumerate(channel_names)
         if matcher.match(name)),
        None,
    )
def _get_channels_to_read(self, rec_name):
    """
    Return the indices of the channels whose name matches
    ``self.channel_pattern``.

    :param rec_name: record name passed to ``wfdb.rdheader``.
    :return: list of channel indices; at most one element (the first match)
        when ``self.first_channel_only`` is set.
    """
    channel_names = wfdb.rdheader(rec_name).sig_name
    matching = []
    for idx, name in enumerate(channel_names):
        if self.channel_pattern.search(name) is not None:
            matching.append(idx)

    if self.first_channel_only and matching:
        return matching[:1]
    return matching
def load_beat_ann(self,
                  rec: str,
                  sampfrom: Optional[int] = None,
                  sampto: Optional[int] = None,
                  keep_original: bool = False) -> Dict[str, np.ndarray]:
    """
    load beat annotations, which are stored in the `symbol` attribute of the
    corresponding annotation files

    Parameters:
    -----------
    rec: str,
        name of the record
    sampfrom: int, optional,
        start index of the annotations to be loaded
    sampto: int, optional,
        end index of the annotations to be loaded
    keep_original: bool, default False,
        if True, indices are kept as in the annotation file,
        otherwise `sampfrom` is subtracted from them if it is specified

    Returns:
    --------
    ann, dict,
        locations (indices) of each beat type in `self.all_beat_types`
    """
    fp = os.path.join(self.db_dir, rec)
    sig_len = wfdb.rdheader(fp).sig_len
    sf = sampfrom or 0
    st = sampto or sig_len
    assert st > sf, "`sampto` should be greater than `sampfrom`!"

    wfdb_ann = wfdb.rdann(
        fp,
        extension=self.manual_ann_ext,
        sampfrom=sf,
        sampto=sampto,
    )

    # group the sample indices by beat type, ignoring unknown symbols
    beat_locs = ED({k: [] for k in self.all_beat_types})
    for idx, bt in zip(wfdb_ann.sample, wfdb_ann.symbol):
        if bt in self.all_beat_types:
            beat_locs[bt].append(idx)

    arrays = {k: np.array(v, dtype=int) for k, v in beat_locs.items()}
    if not keep_original and sampfrom is not None:
        # make the indices relative to the start of the loaded span
        arrays = {k: v - sampfrom for k, v in arrays.items()}
    return ED(arrays)
def readWFBDBasicInfo(path):
    '''
    Print a per-channel summary of a WFDB header: channel index, signal
    name, unit, baseline and ADC zero.  Nothing is returned.
    '''
    header = wfdb.rdheader(path)
    row_template = " {} \t {} \t {} \t {} \t {} "

    print("channel \t sig_name \t Unit \t baseline \t adc_zero")
    print("===============================================")
    for channel in range(header.n_sig):
        row = row_template.format(channel,
                                  header.sig_name[channel],
                                  header.units[channel],
                                  header.baseline[channel],
                                  header.adc_zero[channel])
        print(row)
def read_record(path):
    """
    Read a record's header and attach its physical signal.

    Records shorter than ``max_block_size`` samples are decoded in one go
    from ``<path>.flac``; longer ones go through ``_read_blocks``.

    :param path: record path without extension.
    :return: the header object with ``p_signal`` set to the result of
        ``to_physical``.
    """
    hdr = wfdb.rdheader(path)
    fits_in_one_block = hdr.sig_len < max_block_size
    if fits_in_one_block:
        # the sample rate returned by soundfile is not needed
        data, _ = soundfile.read(path + '.flac', dtype='int16',
                                 always_2d=True)
    else:
        data = _read_blocks(hdr, path)
    hdr.p_signal = to_physical(data, hdr)
    return hdr
def writeAllDiagnosisToFile():
    """
    Figure out all possible diagnoses contained within this database and
    write them, one per line, to ``../out/possible-diagnoses.txt``.

    Record names are read from ``../ptbdb/RECORDS``.  A diagnosis is added
    only if it is not already contained in one of the diagnoses collected so
    far.  Both files are opened in ``with`` blocks so they are closed (and
    the output flushed) on every path.
    """
    possible = []
    with open('../ptbdb/RECORDS', 'r') as records:
        for line in records:
            rfile = '../ptbdb/' + line.strip()
            record = wfdb.rdheader(rfile)
            diagnosis = rutil.extratPatientDiagnoses(record)
            if not any(diagnosis in s for s in possible):
                possible.append(diagnosis)

    with open('../out/possible-diagnoses.txt', 'w') as diagnosesfile:
        for d in possible:
            diagnosesfile.write(d + '\n')
class Record():
    """Metadata of one WFDB record, gathered from its record and header."""

    def __init__(self, record_dir: Path, case: str):
        """
        :param record_dir: path of the record (without extension).
        :param case: case label stored as-is on the instance.
        """
        path = str(record_dir)
        signal_record = wfdb.rdrecord(path)
        header = wfdb.rdheader(path)

        self.record_dir = record_dir
        self.case = case

        # identification and start time come from the header ...
        self.name = header.record_name
        self.time = header.base_time
        self.date = header.base_date

        # ... signal properties from the record that was read
        self.fs = signal_record.fs
        self.slen = signal_record.sig_len
        self.n_sig = signal_record.n_sig
        self.sig_names = signal_record.sig_name
        self.units = signal_record.units

        self.rr = None  # not computed here
def read(record_name):
    """
    Read a record and the pH.

    The pH is parsed from the first header comment that contains 'pH'.

    :param record_name: name of the record below ``PATH``.
    :return: tuple ``(fhr, uc, ph)`` -- the first and second signal columns
        and the pH as a float.
    """
    import wfdb

    record_path = f'{PATH}/{record_name}'
    record = wfdb.rdrecord(record_path)
    header = wfdb.rdheader(record_path)

    ph_lines = [line for line in header.comments if 'pH' in line]
    ph_text = ph_lines[0].replace('pH', '').strip()
    ph = float(ph_text)

    signals = record.p_signal
    fhr = signals[:, 0]
    uc = signals[:, 1]
    return fhr, uc, ph
def get_single_patiennce_data(
        patience,
        dataset_path='Data/StPeterburg/',
):
    """
    Cut one record into per-beat snippets, grouped by annotation label.

    The record is processed in consecutive windows of 10 seconds.  For the
    annotations falling into a window, the mean beat interval is computed
    and, for every annotation whose label is in ``classify_label``, the
    signal from the annotation sample up to 1.2 mean intervals later is
    scaled with ``zero_one_scale``, padded with ``pad`` and collected.

    :param patience: record name (without extension).
    :param dataset_path: prefix prepended to the record name.
    :return: dict mapping every label in ``classify_label`` to the list of
        snippets found for it.
    """
    record_path = dataset_path + patience
    PatienceData = {label: [] for label in classify_label}

    sample_rate = wfdb.rdheader(record_path).fs
    # read the annotation file once; it used to be parsed twice
    annotation = wfdb.rdann(record_path, 'atr')
    qrs_positions = annotation.sample
    qrs_labels = annotation.symbol

    patience_signal, _ = wfdb.rdsamp(record_path)
    patience_signal = np.asarray(patience_signal).T  # (channels, samples)

    window = 10 * sample_rate
    n_samples = len(patience_signal[0])
    n_annotations = len(qrs_positions)
    window_start = 0
    ann_idx = 0
    while window_start < n_samples:
        # collect the annotations up to the end of the current window
        first = ann_idx
        while (ann_idx < n_annotations
               and qrs_positions[ann_idx] <= window_start + window):
            ann_idx += 1
        window_start += window

        if ann_idx == first:
            # No annotation in this window: there is nothing to cut, and
            # computing the mean interval used to raise IndexError here.
            continue

        beats = qrs_positions[first:ann_idx]
        labels = qrs_labels[first:ann_idx]
        # NOTE(review): divides by the number of beats, not the number of
        # intervals (beats - 1); kept as it was.
        mean_interval = (beats[-1] - beats[0]) // len(beats)
        snippet_len = int(1.2 * mean_interval)
        for position, label in zip(beats, labels):
            if label in classify_label:
                PatienceData[label].append(
                    pad(
                        zero_one_scale(
                            patience_signal[:, position:position +
                                            snippet_len])))
    return PatienceData
def extract_notes(infile):
    """
    Return all notes stored in the MIMIC-III database for ``infile``.

    TODO: Will need to build option to include only labs/notes in the period
    where there is waveform/numeric data but for now we include everything
    so it is available for context (eg echo reports)

    Parameters
    ----------
    infile: string
        filename of a wfdb file from the MIMIC3 matched dataset

    Returns
    -------
    notes: DataFrame
        one row per note.  The first column, ``time``, is the whole number
        of seconds between the record's base datetime and the note's
        ``charttime`` (negative for notes charted before the record starts);
        the ``charttime`` and ``chartdate`` columns are dropped.

    Raises
    ------
    ValueError
        if a note has no ``charttime`` (the offset cannot be cast to int).
    """
    # get patient ID
    subj_id = patient_id_from_file(infile)

    # get the notes for this patient
    con = open_db()
    # NOTE(review): subj_id is interpolated into the SQL text.  Switch to
    # `params=` once the DB driver's placeholder style is confirmed.
    query = \
    """
    SELECT i.chartdate, i.charttime, i.description, i.category, i.text
    FROM noteevents i
    WHERE subject_id = {};
    """.format(subj_id)
    notes = pd.read_sql_query(query, con)

    # change time stamp to seconds from origin: vectorised instead of a
    # per-row chained assignment (`notes['time'].iloc[idx] = ...`), which
    # does not write through to the frame when pandas copy-on-write is
    # active.
    origin = pd.to_datetime(wfdb.rdheader(infile).base_datetime)
    elapsed = pd.to_datetime(notes['charttime']) - origin
    notes.insert(0, 'time', elapsed.dt.total_seconds().astype(int))
    del notes['charttime']
    del notes['chartdate']
    return notes
def load_label(self,
               rec: str,
               ann: Optional[wfdb.Annotation] = None,
               sampfrom: Optional[int] = None,
               sampto: Optional[int] = None,
               fmt: str = "a") -> str:
    """
    load the (classifying) label of the record, which is the first comment
    line of its header; among the following three classes:
    "non atrial fibrillation",
    "paroxysmal atrial fibrillation",
    "persistent atrial fibrillation",

    Parameters
    ----------
    rec: str,
        name of the record
    ann: Annotation, optional,
        not used, to keep in accordance with other methods
    sampfrom: int, optional,
        not used, to keep in accordance with other methods
    sampto: int, optional,
        not used, to keep in accordance with other methods
    fmt: str, default "a",
        format of the label, case insensitive, can be one of:
        "f", "fullname": the full name of the label
        "a", "abbr", "abbreviation": abbreviation for the label
        "n", "num", "number": class number of the label

    Returns
    -------
    label: str,
        classifying label of the record

    Raises
    ------
    ValueError,
        if `fmt` is none of the values listed above
    """
    full_name = wfdb.rdheader(self._get_path(rec)).comments[0]
    label_format = fmt.lower()

    if label_format in ["a", "abbr", "abbreviation"]:
        return self._labels_f2a[full_name]
    if label_format in ["n", "num", "number"]:
        return self._labels_f2n[full_name]
    if label_format in ["f", "fullname"]:
        return full_name
    raise ValueError(f"format `{fmt}` of labels is not supported!")
def show_table(self):
    """
    Fill ``self.patient_table`` with one row per record (this many rows:
    ``self.patient``).

    Row ``r`` describes record ``100 + r``: record number, age, gender
    (from the first header comment), the second header comment, and the
    number of annotations with symbol A, V, F, R and L in columns 4-8.
    The timer is stopped and the bottom layout switched to page 4 first.
    """
    self.timer.stop()
    self.bottom_layout.setCurrentIndex(4)

    table = self.patient_table
    beat_symbols = "AVFRL"  # shown in columns 4..8, in this order
    for row in range(self.patient):
        record_name = str(100 + row)
        record_path = 'MIT-BIH/mit-bih-database/' + record_name
        table.setItem(row, 0, QTableWidgetItem(record_name))

        head = wfdb.rdheader(record_path)
        age, gender, _, _, _ = head.comments[0].split(" ")
        table.setItem(row, 1, QTableWidgetItem(str(age)))
        table.setItem(row, 2, QTableWidgetItem(str(gender)))
        table.setItem(row, 3, QTableWidgetItem(str(head.comments[1])))

        annotation = wfdb.rdann(record_path, "atr", sampfrom=0,
                                sampto=650000)
        counts = {symbol: 0 for symbol in beat_symbols}
        for symbol in annotation.symbol:
            if symbol in counts:
                counts[symbol] += 1
        for column, symbol in enumerate(beat_symbols, start=4):
            table.setItem(row, column,
                          QTableWidgetItem(str(counts[symbol])))

    table.resizeColumnsToContents()
def _aggregate_stats(self) -> NoReturn:
    """
    aggregate stats on the whole dataset

    The stats are loaded from "stats.csv" in `self.db_dir_base` if that file
    exists. They are recomputed from the record headers and written back to
    that file when the loaded table is empty or its columns differ from
    `self._stats_columns`.
    """
    stats_file_fp = os.path.join(self.db_dir_base, "stats.csv")
    if os.path.isfile(stats_file_fp):
        self._stats = pd.read_csv(stats_file_fp)

    needs_rebuild = self._stats.empty or set(self._stats_columns) != set(
        self._stats.columns)
    if needs_rebuild:
        print(
            "Please wait patiently to let the reader aggregate statistics on the whole dataset..."
        )
        start = time.time()

        # use self.all_records to ensure it's computed
        self._stats = pd.DataFrame(self.all_records, columns=["record"])

        def per_record(func):
            # evaluate `func` on every record name
            return self._stats["record"].apply(func)

        self._stats["tranche"] = per_record(
            lambda name: self._all_records_inv[name])
        self._stats["subject_id"] = per_record(
            lambda name: int(name.split("_")[1]))
        self._stats["record_id"] = per_record(
            lambda name: int(name.split("_")[2]))
        self._stats["label"] = per_record(
            lambda name: self.load_label(name))
        self._stats["fs"] = self.fs
        self._stats["sig_len"] = per_record(
            lambda name: wfdb.rdheader(self._get_path(name)).sig_len)
        self._stats["sig_len_sec"] = (self._stats["sig_len"] /
                                      self._stats["fs"])
        self._stats["revised"] = per_record(
            lambda name: 1 if name in self.__revised_records else 0)

        self._stats = self._stats.sort_values(
            by=["subject_id", "record_id"], ignore_index=True)
        self._stats = self._stats[self._stats_columns]
        self._stats.to_csv(stats_file_fp, index=False)
        print(f"Done in {time.time() - start:.5f} seconds!")
    # otherwise: currently no need to parse the loaded csv file

    self.__all_records = self._stats["record"].tolist()
def testSingleEpoch(self, i):
    """
    Evaluate every record in ``self.testList`` (in shuffled order), print
    the summary and append it as a CSV line to ``self.testfile``.

    The summary consists of the square root of the summed squared errors,
    the fraction of correct predictions and the fraction of correct healthy
    predictions.

    :param i: epoch number, written as the first CSV field.
    """
    random.shuffle(self.testList)

    squared_errors = []
    hits = 0
    healthy_hits = 0
    for rec in self.testList:
        header = wfdb.rdheader('../ptbdb/' + rec)
        sqerr, correct, healthyCorrect = self.error(rec, header)
        squared_errors.append(sqerr)
        hits += 1 if correct else 0
        healthy_hits += 1 if healthyCorrect else 0

    totalsqerr = math.sqrt(sum(squared_errors))
    tested = len(self.testList)
    correctpercent = float(hits) / tested
    correctHealth = float(healthy_hits) / tested

    print('square mean error: %.4f' % totalsqerr)
    print('correct predictions: %.4f' % correctpercent)
    print('correct healthy prediction: %.4f' % correctHealth)

    fields = (i, totalsqerr, correctpercent, correctHealth)
    self.testfile.write(",".join(str(value) for value in fields) + "\n")
def load_ann(self, rec:str, sampfrom:Optional[int]=None, sampto:Optional[int]=None, fmt:str="interval", keep_original:bool=False) -> Union[Dict[str, list], np.ndarray]:
    """
    load the rhythm annotations of a record

    Parameters:
    -----------
    rec: str,
        name of the record
    sampfrom: int, optional,
        start index of the annotations to be loaded
    sampto: int, optional,
        end index of the annotations to be loaded
    fmt: str, default "interval", case insensitive,
        format of returned annotation, can also be "mask"
    keep_original: bool, default False,
        if True, in the "interval" `fmt`, intervals (in the form [a,b])
        are kept as in the annotation file,
        otherwise `sampfrom` is subtracted from them if it is specified

    Returns:
    --------
    ann, dict or ndarray,
        the annotations in the format of intervals, or in the format of mask
    """
    fp = os.path.join(self.db_dir, rec)
    wfdb_ann = wfdb.rdann(fp, extension=self.ann_ext)
    sig_len = wfdb.rdheader(fp).sig_len
    sf = sampfrom or 0
    st = sampto or sig_len
    assert st > sf, "`sampto` should be greater than `sampfrom`!"

    # Rhythm `rhythms[i]` lasts from `boundaries[i]` to `boundaries[i+1]`.
    boundaries = wfdb_ann.sample.tolist() + [sig_len]
    rhythms = wfdb_ann.aux_note
    if rhythms[0] == "(N":
        # ref. the doc string of the class
        boundaries[0] = 0
    else:
        # the record starts with an implicit "(N" segment beginning at 0
        boundaries.insert(0, 0)
        rhythms.insert(0, "(N")

    raw_intervals = ED({name: [] for name in self.class_map.keys()})
    for idx, rhythm in enumerate(rhythms):
        segment = [boundaries[idx], boundaries[idx+1]]
        raw_intervals[rhythm.replace("(", "")].append(segment)

    # restrict every rhythm to the requested span [sf, st]
    ann = ED({
        name: generalized_intervals_intersection(l_itv, [[sf,st]])
        for name, l_itv in raw_intervals.items()
    })

    if fmt.lower() == "mask":
        mask = np.full(shape=(st-sf,), fill_value=self.class_map.N, dtype=int)
        for rhythm, l_itv in ann.items():
            for itv in l_itv:
                mask[itv[0]-sf: itv[1]-sf] = self.class_map[rhythm]
        return mask

    if not keep_original:
        for name, l_itv in ann.items():
            ann[name] = [[itv[0]-sf, itv[1]-sf] for itv in l_itv]
    return ann
def convert_mimic_matched (filename, samp_end = None, all_labs=True, all_notes=True):
    '''
    Convert a MIMIC matched record (waveform + numerics) to 'mimic_test.h5'.

    The numerics record (``filename + 'n'``) and the waveform record are
    stored as compressed record arrays, demographics and diagnoses as JSON
    attributes; labs and notes are extracted and written afterwards.

    filename  - wfdb record name; a trailing 'n' (numerics file) is dropped
                so the waveform record is used as base
    samp_end  - to limit size of datafile for testing, default = None
    all_labs  - include labs from outside the time range of the specified
                signal file (currently not used)
    all_notes - include notes from outside the time range of the specified
                signal file (currently not used)

    TODO: time filter
          additional demographics
          mapping function
    '''
    # use base file, pull numerics
    if filename[-1] == 'n':
        print('Base is numerics file, change basename to waveform and then process numerics')
        filename = filename[:-1]
    else:
        print('Base is waveform file, add numerics')

    # generate output filename
    outfile = 'mimic_test.h5'

    # read header
    record = wfdb.rdheader(filename)
    # meta_head = wfdb_head_meta(filename)

    # storage options shared by both datasets
    storage = dict(maxshape = (None,), compression="gzip",
                   compression_opts=9, shuffle = True)

    with h5py.File(outfile, 'w') as f:
        # meta = f.require_group('.meta')
        # meta.attrs['data'] = json.dumps(meta_head, indent = 4)
        # meta.attrs['mapping'] = json.dumps('Placeholder', indent = 4)
        grp_numerics = f.require_group('numerics')
        root = f['/']

        print('Converting numerics')
        record = wfdb.rdrecord(filename+'n', sampfrom = 0, sampto = samp_end )
        df = pd.DataFrame(data = record.p_signal, columns = record.sig_name)
        ds_num = grp_numerics.create_dataset(
            'vitals', data = df.to_records(index=False), **storage)

        grp_waveforms = f.require_group('/waveforms')
        print('Converting waveforms')
        record = wfdb.rdrecord(filename, sampfrom = 0, sampto = samp_end )
        df = pd.DataFrame(data = record.p_signal, columns = record.sig_name)
        ds_wave = grp_waveforms.create_dataset(
            'hemodynamics', data = df.to_records(index=False), **storage)

        grp_clinical = f.require_group('/clinical')

        # demographics
        print('Locating admission')
        admission = find_admission(filename)
        (subj_id, hadm_id, age, gender, ethnicity, diagnosis, expired,
         death_time) = admission
        demographics = {
            'Age' : age,
            'Ethnicity' : ethnicity,
            'Gender' : gender,
            'Expired' : expired,
            'Death_time': death_time
        }
        root.attrs['demographics'] = json.dumps(demographics, indent = 4)
        grp_clinical.attrs['admit_diagnosis'] = json.dumps(diagnosis)

        # add additional codes from the diagnosis table, saved as dict
        dx_list = get_diagnoses(hadm_id)
        grp_clinical.attrs['diagnoses'] = json.dumps(dx_list, indent = 4)
        # reseach = f.require_group('/Research')

    # The writers below receive the file name, so they run once the file
    # above has been closed.
    print ('Extracting labs')
    write_labs(extract_labs(filename), outfile)

    print ('Extracting notes')
    write_notes(extract_notes(filename), outfile)
def load_af_episodes(
        self,
        rec: str,
        ann: Optional[wfdb.Annotation] = None,
        sampfrom: Optional[int] = None,
        sampto: Optional[int] = None,
        zero_start: bool = False,
        fs: Optional[Real] = None,
        fmt: str = "intervals") -> Union[List[List[int]], np.ndarray]:
    """
    load the episodes of atrial fibrillation, in terms of intervals or mask

    Parameters
    ----------
    rec: str,
        name of the record
    ann: Annotation, optional,
        the wfdb Annotation of the record;
        currently not used: the annotation file is always read from disk
    sampfrom: int, optional,
        start index of the data to be loaded,
        must not be used when `fmt` is "c_intervals"
    sampto: int, optional,
        end index of the data to be loaded,
        must not be used when `fmt` is "c_intervals"
    zero_start: bool, default False,
        if True, (relative) start index is zero,
        otherwise, (relative) start index is `sampfrom`,
        has no effect when `fmt` is "c_intervals"
    fs: real number, optional,
        if not None, positions of the loaded intervals or mask will be
        adjusted according to this sampling frequency
    fmt: str, default "intervals", case insensitive,
        format of the episodes of atrial fibrillation, can be one of
        "intervals", "mask", "c_intervals"

    Returns
    -------
    af_episodes: list or ndarray,
        episodes of atrial fibrillation, in terms of intervals or mask;
        for "c_intervals", pairs of indices into the annotation arrays
        (not sample positions)

    Raises
    ------
    ValueError,
        if `fmt` is "c_intervals" and the validated sample interval does not
        cover the whole record
    """
    header = wfdb.rdheader(self._get_path(rec))
    label = self._labels_f2a[header.comments[0]]
    siglen = header.sig_len
    # if ann is None or fmt.lower() in ["c_intervals",]:
    #     _ann = wfdb.rdann(self._get_path(rec), extension=self.ann_ext)
    # else:
    #     _ann = ann
    _ann = wfdb.rdann(self._get_path(rec), extension=self.ann_ext)
    sf, st = self._validate_samp_interval(rec, sampfrom, sampto)
    aux_note = np.array(_ann.aux_note)
    critical_points = _ann.sample
    # positions (within the annotation arrays) where an AF / AFL episode
    # starts, and where it ends (the rhythm goes back to "(N")
    af_start_inds = np.where((aux_note == "(AFIB") |
                             (aux_note == "(AFL"))[0]  # ref. NOTE 3.
    af_end_inds = np.where(aux_note == "(N")[0]
    assert len(af_start_inds) == len(af_end_inds), \
        "unequal number of af period start indices and af period end indices"

    if fmt.lower() in [
            "c_intervals",
    ]:
        # annotation-index pairs only make sense for the whole record
        if sf > 0 or st < siglen:
            raise ValueError(
                f"when `fmt` is `c_intervals`, `sampfrom` and `sampto` should never be used!"
            )
        af_episodes = [[start, end]
                       for start, end in zip(af_start_inds, af_end_inds)]
        return af_episodes

    # translate the annotation indices into sample positions and restrict
    # them to the requested span [sf, st]
    intervals = []
    for start, end in zip(af_start_inds, af_end_inds):
        itv = [critical_points[start], critical_points[end]]
        intervals.append(itv)
    intervals = generalized_intervals_intersection(intervals, [[sf, st]])

    # from here on `siglen` is the length of the requested span
    siglen = st - sf
    if fs is not None and fs != self.fs:
        # rescale lengths and positions to the requested sampling frequency
        siglen = self._round(siglen * fs / self.fs)
        sf = self._round(sf * fs / self.fs)
        if label == "AFf":
            # ref. NOTE. 1 of the class docstring
            # the `ann.sample` does not always satify this point after resampling
            intervals = [[sf, siglen - 1]]
        else:
            intervals = [[
                self._round(itv[0] * fs / self.fs),
                self._round(itv[1] * fs / self.fs)
            ] for itv in intervals]

    if zero_start:
        # shift so that the span starts at index 0; `sf` is reset so the
        # mask below is not shifted a second time
        intervals = [[itv[0] - sf, itv[1] - sf] for itv in intervals]
        sf = 0

    af_episodes = intervals

    if fmt.lower() in [
            "mask",
    ]:
        # 1 inside an AF episode, 0 elsewhere
        mask = np.zeros((siglen, ), dtype=int)
        for itv in intervals:
            mask[itv[0] - sf:itv[1] - sf] = 1
        af_episodes = mask

    return af_episodes