def _load_edflib(filename):
    """Load a multi-channel Timeseries from an EDF or EDF+ file, using edflib.

    Args:
        filename: path to an EDF/EDF+ file

    Returns:
        Timeseries: array of shape (n_samples, n_channels), float32, with the
        EDF signal text labels attached to the channel axis.

    Raises:
        Error: if channels have differing numbers of samples or sample rates.
    """
    import edflib
    e = edflib.EdfReader(filename, annotations_mode='all')
    if np.ptp(e.get_samples_per_signal()) != 0:
        raise Error('channels have differing numbers of samples')
    if np.ptp(e.get_signal_freqs()) != 0:
        raise Error('channels have differing sample rates')
    n = e.samples_in_file(0)
    m = e.signals_in_file
    channelnames = e.get_signal_text_labels()
    dt = 1.0 / e.samplefrequency(0)
    # EDF files hold <=16 bits of information for each sample. Representing as
    # double precision (64bit) is unnecessary use of memory. use 32 bit float:
    ar = np.zeros((n, m), dtype=np.float32)
    # edflib requires input buffer of float64s
    buf = np.zeros((n,), dtype=np.float64)
    for i in range(m):
        e.read_phys_signal(i, 0, n, buf)
        ar[:, i] = buf
    # Build the time axis by scaling integer indices rather than using a
    # float-endpoint np.arange(0, (n-1+0.5)*dt, dt): with a floating step the
    # number of generated points can round to n-1 or n+1 and silently
    # mismatch the data length. Integer arange is exact.
    tspan = (np.arange(n) * dt).astype(np.float32)
    return Timeseries(ar, tspan, labels=[None, channelnames])
def annotations_from_file(filename):
    """Read the event annotations stored in an EDF/EDF+ file via edflib.

    Args:
        filename: EDF+ file

    Returns:
        list: annotation events, each in the form [start_time, duration, text]
    """
    import edflib
    reader = edflib.EdfReader(filename, annotations_mode='all')
    return reader.read_annotations()
def estimate_emission_probs(train_data_file, label_file, signal_indices, mute=False):
    """
    Estimate the emission probabilities of the hidden states. It assumes that
    each hidden state produces observations following a Gaussian distribution.

    For every labeled epoch, the feature vectors of all requested signals are
    horizontally stacked into one column; columns are grouped by stage label
    (W / N / R), then a per-stage mean vector and covariance matrix are
    computed and saved to ./data/params/<session_num>.npz.

    :param train_data_file: the file with the training data (.edf)
    :param label_file: the file with labels for the training data (.csv)
    :param signal_indices: indices of the EDF signals to extract features from
    :param mute: do not print some messages to screen
    :returns: True on success (parameters are written to disk as a side effect)
    """
    print "Estimating emission probabilities for file %s" % train_data_file
    # 1) Read EDF file and setup
    _, _, session_num, _ = parse_filename(label_file)
    # EDF reader object
    e = edflib.EdfReader(train_data_file)
    # Signal numbers
    # 5 -> EOG Left (freq: 50Hz)
    # 7 -> EEG_1 (freq: 125Hz)
    # 8 -> Respiration (freq: 10Hz)
    # Signal frequencies
    freqs = get_signal_frequencies(e, signal_indices)
    signals = get_signals(e, signal_indices)
    print "Signal frequencies: ", freqs
    num_epochs = number_of_epochs(signals[0], freqs[0])
    print "Number of epochs: %d" % num_epochs
    # Verify length of signals are consistent (only the first two signals are
    # cross-checked here)
    if len(signal_indices) > 1:
        num_epochs_verify = number_of_epochs(signals[1], freqs[1])
        assert num_epochs == num_epochs_verify
    # 2) Load epoch labels from labeled data
    stages = get_stages_array(label_file)
    assert len(stages) == num_epochs
    # 3) Create feature matrices for each hidden state
    first_w, first_n, first_r = [True for i in range(0, 3)]
    # For each epoch in the signal (ei: epoch index)
    for ei in xrange(0, num_epochs):
        # Data label (Wake, NREM, REM)
        label = stages[ei]
        # For each signal
        first_sig = True
        for sid in range(0, len(signal_indices)):
            # Retrieve data in ei-th epoch
            epoch_data = get_epoch_data(signals[sid], freqs[sid], ei, 1, mute)
            # Extract the features from the data
            epoch_feats = get_features(epoch_data, freqs[sid])
            # Features is features vector composed of features of many signals stacked together
            if first_sig is True:
                features = epoch_feats
                first_sig = False
            else:
                features = np.hstack((features, epoch_feats))
        # Accumulate this epoch's feature column into the matrix of its stage.
        if label == 'W':
            if mute is False:
                print "Epoch has label [Wake]"
            if first_w is True:
                w_feats = features
                first_w = False
            elif first_w is False:
                w_feats = np.column_stack((w_feats, features))
        if label == "N":
            if mute is False:
                print "Epoch has label [NREM]"
            if first_n is True:
                n_feats = features
                first_n = False
            elif first_n is False:
                n_feats = np.column_stack((n_feats, features))
        if label == "R":
            if mute is False:
                print "Epoch has label [REM]"
            if first_r is True:
                r_feats = features
                first_r = False
            elif first_r is False:
                r_feats = np.column_stack((r_feats, features))
    # print w_feats; print n_feats; print r_feats
    # NOTE(review): if any stage never occurs in this file, its *_feats is
    # unbound and the lines below raise NameError — confirm inputs always
    # contain all three stages.
    # 4) Compute mean vectors, and covariance matrices for each hidden state
    # For feature matrix of each hidden state, compute average of all features across all observations
    mu_w = w_feats.mean(1)
    mu_n = n_feats.mean(1)
    mu_r = r_feats.mean(1)
    # For feature matrix of each hidden state, compute covariance matrix.
    # Use np.cov, documentation: http://docs.scipy.org/doc/numpy-1.10.1/reference/generated/numpy.cov.html
    # np.cov(X), where X s.t.: Each row of m represents a variable, and each column a single observation of all those variables.
    sigma_w = np.cov(w_feats)
    sigma_n = np.cov(n_feats)
    sigma_r = np.cov(r_feats)
    print "Number of features: %d" % len(mu_w)
    # NOTE(review): the message below is missing the closing quote around the
    # path — cosmetic only, left unchanged here.
    print "Saving parameters (mu and sigma) to './data/params/%s.npz" % session_num
    np.savez('./data/params/' + session_num, mu_w=mu_w, mu_n=mu_n, mu_r=mu_r,
             sigma_w=sigma_w, sigma_r=sigma_r, sigma_n=sigma_n)
    del e
    return True
def edf2hdf(fn, outfn='', anonymize=False, verbose=False):
    """
    convert an edf file @fn to hdf5 using fairly straightforward mapping

    if no @outfn is specified, then use the same name as the @fn but change extention to "eeg.h5"

    :param fn: path to the source EDF/EDF+ file
    :param outfn: target HDF5 path; defaults to <basename(fn)>.eeg.h5 in the
        current directory
    :param anonymize: not implemented — raises Exception when True
    :param verbose: print the parsed header and labels; also enables debug output

    return True if successful
    """
    if not outfn:
        base = os.path.basename(fn)
        base, ext = os.path.splitext(base)
        # outfn = os.path.join(hdf_dir, base)
        outfn = base + '.eeg.h5'
    # all the data point related stuff
    with edflib.EdfReader(fn) as ef:
        # read all EDF+ header information in just the way I want it
        header = {
            'file_name': os.path.basename(fn),
            'filetype': ef.filetype,
            'patient_name': ef.patient_name,
            'patientcode': ef.patientcode,
            'gender': ef.gender,
            'signals_in_file': ef.signals_in_file,
            'datarecords_in_file': ef.datarecords_in_file,
            'file_duration_100ns': ef.file_duration_100ns,
            'file_duration_seconds': ef.file_duration_seconds,
            'startdate_date': datetime.date(ef.startdate_year, ef.startdate_month,
                                            ef.startdate_day),
            'start_datetime': datetime.datetime(ef.startdate_year, ef.startdate_month,
                                                ef.startdate_day, ef.starttime_hour,
                                                ef.starttime_minute, ef.starttime_second),
            'starttime_subsecond_offset': ef.starttime_subsecond,
            'birthdate_date': ef.birthdate_date,
            'patient_additional': ef.patient_additional,
            'admincode': ef.admincode,  # usually the study eg. C13-100
            'technician': ef.technician,
            'equipment': ef.equipment,
            'recording_additional': ef.recording_additional,
            'datarecord_duration_100ns': ef.datarecord_duration_100ns,
        }
        # debug() is print when verbose, otherwise a no-op
        if verbose:
            pprint.pprint(header)
            debug = print
        else:
            def nulfunction(*args, **kwargs):
                return None
            debug = nulfunction
        # use arrow
        start_datetime = header['start_datetime']
        # end_date_time = datetime.datetime(ef.enddate_year, ef.enddate_month, ef.enddate_day, ef.endtime_hour,
        #                                   ef.endtime_minute, ef.endtime_second)  # tz naive
        # end_date_time - start_date_time
        duration = datetime.timedelta(seconds=header['file_duration_seconds'])
        # derived information
        birthdate = header['birthdate_date']
        if birthdate:
            age = arrow.get(start_datetime) - arrow.get(header['birthdate_date'])
            debug('predicted age: %s' % age)
            # total_seconds() returns a float
            debug('predicted age (seconds): %s' % age.total_seconds())
        else:
            # no birthdate available: record zero age and an empty birthdate
            age = datetime.timedelta(seconds=0)
            birthdate = ''
        if anonymize:
            raise Exception('not implemented')
        # anonymized version if necessary
        header['end_datetime'] = header['start_datetime'] + duration
        ############# signal array information ##################
        # signal block related stuff
        nsigs = ef.signals_in_file
        # again know/assume that this is uniform sampling across signals
        # for each record block
        fs0 = ef.samplefrequency(0)
        signal_frequency_array = ef.get_signal_freqs()
        # print("signal_frequency_array::\n", repr(signal_frequency_array))
        assert all(signal_frequency_array == fs0)
        nsamples0 = ef.samples_in_file(0)  # samples per channel
        debug('nsigs=%s, fs0=%s, nsamples0=%s\n' % (nsigs, fs0, nsamples0))
        num_samples_per_signal = ef.get_samples_per_signal()  # np array
        # print("num_samples_per_signal::\n", repr(num_samples_per_signal), '\n')
        assert all(num_samples_per_signal == nsamples0)
        file_duration_sec = ef.file_duration_seconds
        #print("file_duration_sec", repr(file_duration_sec))
        # Note that all annotations except the top row must also specify a duration.
        # long long onset; /* onset time of the event, expressed in units of 100
        #                     nanoSeconds and relative to the starttime in the header */
        # char duration[16]; /* duration time, this is a null-terminated ASCII text-string */
        # char annotation[EDFLIB_MAX_ANNOTATION_LEN + 1]; /* description of the
        #                     event in UTF-8, this is a null term string of max length 512 */
        # start("x.y"), end, char[20]
        # annotations = ef.read_annotations_as_array() # get numpy array of
        # annotations
        annotations_b = ef.read_annotations_b_100ns_units()
        # print("annotations_b::\n")
        # pprint.pprint(annotations_b)  # get list of annotations
        signal_text_labels = ef.get_signal_text_labels()
        debug("signal_text_labels::\n")
        if verbose:
            pprint.pprint(signal_text_labels)
        # ef.recording_additional
        # print()
        signal_digital_mins = np.array(
            [ef.digital_min(ch) for ch in range(nsigs)])
        signal_digital_total_min = min(signal_digital_mins)
        #print("digital mins:", repr(signal_digital_mins))
        #print("digital total min:", repr(signal_digital_total_min))
        signal_digital_maxs = np.array(
            [ef.digital_max(ch) for ch in range(nsigs)])
        signal_digital_total_max = max(signal_digital_maxs)
        #print("digital maxs:", repr(signal_digital_maxs))
        #print("digital total max:", repr(signal_digital_total_max))
        signal_physical_dims = [
            ef.physical_dimension(ch) for ch in range(nsigs)]
        # print('signal_physical_dims::\n')
        # pprint.pprint(signal_physical_dims)
        #print()
        signal_physical_maxs = np.array(
            [ef.physical_max(ch) for ch in range(nsigs)])
        #print('signal_physical_maxs::\n', repr(signal_physical_maxs))
        signal_physical_mins = np.array(
            [ef.physical_min(ch) for ch in range(nsigs)])
        #print('signal_physical_mins::\n', repr(signal_physical_mins))
        # this don't seem to be used much so I will put at end
        signal_prefilters = [ef.prefilter(ch).strip() for ch in range(nsigs)]
        #print('signal_prefilters::\n')
        # pprint.pprint(signal_prefilters)
        #print()
        signal_transducers = [ef.transducer(ch).strip() for ch in range(nsigs)]
        #print('signal_transducers::\n')
        #pprint.pprint(signal_transducers)
        # Write everything out: patient info, record block metadata,
        # annotations, then stream the digital samples in chunks.
        with eeghdf.EEGHDFWriter(outfn, 'w') as eegf:
            if header['birthdate_date']:
                birthdate_isostring = header['birthdate_date'].strftime('%Y-%m-%d')
            else:
                birthdate_isostring = ''
            eegf.write_patient_info(patient_name=header['patient_name'],
                                    patientcode=header['patientcode'],
                                    gender=header['gender'],
                                    birthdate_isostring=birthdate_isostring,
                                    # gestational_age_at_birth_days
                                    # born_premature
                                    patient_additional=header['patient_additional'])
            rec = eegf.create_record_block(record_duration_seconds=header['file_duration_seconds'],
                                           start_isodatetime=str(header['start_datetime']),
                                           end_isodatetime=str(header['end_datetime']),
                                           number_channels=header['signals_in_file'],
                                           num_samples_per_channel=nsamples0,
                                           sample_frequency=fs0,
                                           signal_labels=signal_text_labels,
                                           signal_physical_mins=signal_physical_mins,
                                           signal_physical_maxs=signal_physical_maxs,
                                           signal_digital_mins=signal_digital_mins,
                                           signal_digital_maxs=signal_digital_maxs,
                                           physical_dimensions=signal_physical_dims,
                                           patient_age_days=age.total_seconds() / 86400.0,
                                           signal_prefilters=signal_prefilters,
                                           signal_transducers=signal_transducers,
                                           technician=header['technician'])
            eegf.write_annotations_b(annotations_b)  # may be should be called record annotations
            edfblock_itr = edf_block_iter_generator(
                ef,
                nsamples0,
                # samples_per_chunk roughly 100 datarecords at a time
                100 * ef.samples_in_datarecord(0) * header['signals_in_file'],
                dtype='int32')
            signals = eegf.stream_dig_signal_to_record_block(rec, edfblock_itr)
        return True  # we succeeded
def edf2hdf(fn, outfn="", hdf_dir="", anonymize=False):
    """
    convert an edf file to hdf5 using fairly straightforward mapping
    return True if successful

    by default (if outfn and hdf_dir are not set) the output is put in the same directory as the input file
    you can also specify the output file (full path) by setting outfn directly
    or simple specify a different target directory by specifying @hdf_dir as a directory path

    :param fn: path to the source EDF/EDF+ file
    :param outfn: full output path; when empty it is derived from @fn and @hdf_dir
    :param hdf_dir: optional target directory used when @outfn is empty
    :param anonymize: when True the header is replaced by a simple anonymous one

    @database_sourcel_label tells us which database it came from LPCH_NK or STANFORD_NK
        this is important!
    """
    if not outfn:
        parentdir = os.path.dirname(fn)
        base = os.path.basename(fn)
        base, ext = os.path.splitext(base)
        base = base + DEFAULT_EXT
        if hdf_dir:
            outfn = os.path.join(hdf_dir, base)
        else:
            outfn = os.path.join(parentdir, base)
        # debug('outfn:', outfn)
    # all the data point related stuff
    with edflib.EdfReader(fn) as ef:
        # read all EDF+ header information in just the way I want it
        header = {
            "file_name": os.path.basename(fn),
            "filetype": ef.filetype,
            "patient_name": ef.patient_name,
            "patientcode": ef.patientcode,
            "studyadmincode": ef.admincode,
            "gender": ef.gender,
            "signals_in_file": ef.signals_in_file,
            "datarecords_in_file": ef.datarecords_in_file,
            "file_duration_100ns": ef.file_duration_100ns,
            "file_duration_seconds": ef.file_duration_seconds,
            "startdate_date": datetime.date(ef.startdate_year, ef.startdate_month,
                                            ef.startdate_day),
            "start_datetime": datetime.datetime(
                ef.startdate_year,
                ef.startdate_month,
                ef.startdate_day,
                ef.starttime_hour,
                ef.starttime_minute,
                ef.starttime_second,
            ),
            "starttime_subsecond_offset": ef.starttime_subsecond,
            "birthdate_date": ef.birthdate_date,  # str
            "patient_additional": ef.patient_additional,  # str
            "admincode": ef.admincode,  # usually the study eg. C13-100
            "technician": ef.technician,
            "equipment": ef.equipment,
            "recording_additional": ef.recording_additional,
            "datarecord_duration_100ns": ef.datarecord_duration_100ns,
        }
        # debug
        debug("original header")
        debug(pprint.pformat(header))
        # use arrow
        start_datetime = header["start_datetime"]
        duration = datetime.timedelta(seconds=header["file_duration_seconds"])
        # derived information
        birthdate = header["birthdate_date"]
        if birthdate:
            age = arrow.get(start_datetime) - arrow.get(
                header["birthdate_date"])
            debug("predicted age: %s" % age)
            # total_seconds() returns a float
            debug("predicted age (seconds): %s" % age.total_seconds())
        else:
            # no birthdate available: treat age as zero
            age = datetime.timedelta(seconds=0)
        if anonymize:
            anonymous_header = create_simple_anonymous_header(header)
            header = anonymous_header
        header["end_datetime"] = header["start_datetime"] + duration
        ############# signal array information ##################
        # signal block related stuff
        nsigs = ef.signals_in_file
        # again know/assume that this is uniform sampling across signals
        fs0 = ef.samplefrequency(0)
        signal_frequency_array = ef.get_signal_freqs()
        # Show where (if anywhere) the sampling frequency changes between
        # adjacent channels, block by block.
        dfs = np.diff(signal_frequency_array)
        dfs_ind = np.where(dfs != 0.0)
        dfs_ind = dfs_ind[0]
        last_ind = 0
        for dd in dfs_ind + 1:
            debug("block:", signal_frequency_array[last_ind:dd])
            last_ind = dd
        debug("last block:", signal_frequency_array[last_ind:])
        debug("where does sampling rate change?", np.where(dfs != 0.0))
        debug("elements:", signal_frequency_array[np.where(dfs != 0.0)])
        debug("signal_frequency_array::\n", repr(signal_frequency_array))
        debug("len(signal_frequency_array):", len(signal_frequency_array))
        # NOTE(review): the [:-3] slice tolerates differing rates on the last
        # three channels — presumably annotation/auxiliary channels; confirm.
        assert all(signal_frequency_array[:-3] == fs0)
        nsamples0 = ef.samples_in_file(0)  # samples per channel
        debug("nsigs=%s, fs0=%s, nsamples0=%s\n" % (nsigs, fs0, nsamples0))
        num_samples_per_signal = ef.get_samples_per_signal()  # np array
        debug("num_samples_per_signal::\n", repr(num_samples_per_signal), "\n")
        # assert all(num_samples_per_signal == nsamples0)
        file_duration_sec = ef.file_duration_seconds
        # debug("file_duration_sec", repr(file_duration_sec))
        # Note that all annotations except the top row must also specify a duration.
        # long long onset; /* onset time of the event, expressed in units of 100
        #                     nanoSeconds and relative to the starttime in the header */
        # char duration[16]; /* duration time, this is a null-terminated ASCII text-string */
        # char annotation[EDFLIB_MAX_ANNOTATION_LEN + 1]; /* description of the
        #                     event in UTF-8, this is a null term string of max length 512 */
        # start("x.y"), end, char[20]
        # annotations = ef.read_annotations_as_array() # get numpy array of
        # annotations
        annotations_b = ef.read_annotations_b_100ns_units()
        # debug("annotations_b::\n")
        # pprint.pprint(annotations_b)  # get list of annotations
        signal_text_labels = ef.get_signal_text_labels()
        debug("signal_text_labels::\n")
        debug(pprint.pformat(signal_text_labels))
        debug("normalized text labels::\n")
        signal_text_labels_lpch_normalized = [
            normalize_lpch_signal_label(label) for label in signal_text_labels
        ]
        debug(pprint.pformat(signal_text_labels_lpch_normalized))
        # ef.recording_additional
        # debug()
        signal_digital_mins = np.array(
            [ef.digital_min(ch) for ch in range(nsigs)])
        signal_digital_total_min = min(signal_digital_mins)
        debug("digital mins:", repr(signal_digital_mins))
        debug("digital total min:", repr(signal_digital_total_min))
        signal_digital_maxs = np.array(
            [ef.digital_max(ch) for ch in range(nsigs)])
        signal_digital_total_max = max(signal_digital_maxs)
        debug("digital maxs:", repr(signal_digital_maxs))
        # debug("digital total max:", repr(signal_digital_total_max))
        signal_physical_dims = [
            ef.physical_dimension(ch) for ch in range(nsigs)
        ]
        # debug('signal_physical_dims::\n')
        # pprint.pformat(signal_physical_dims)
        # debug()
        signal_physical_maxs = np.array(
            [ef.physical_max(ch) for ch in range(nsigs)])
        # debug('signal_physical_maxs::\n', repr(signal_physical_maxs))
        signal_physical_mins = np.array(
            [ef.physical_min(ch) for ch in range(nsigs)])
        # debug('signal_physical_mins::\n', repr(signal_physical_mins))
        # this don't seem to be used much so I will put at end
        signal_prefilters = [ef.prefilter(ch).strip() for ch in range(nsigs)]
        # debug('signal_prefilters::\n')
        # pprint.pformat(signal_prefilters)
        # debug()
        signal_transducers = [ef.transducer(ch).strip() for ch in range(nsigs)]
        # debug('signal_transducers::\n')
        # pprint.pformat(signal_transducers)
        # Write everything out: patient info, record block metadata,
        # annotations, then stream the digital samples in chunks.
        with eeghdf.EEGHDFWriter(outfn, "w") as eegf:
            eegf.write_patient_info(
                patient_name=header["patient_name"],
                patientcode=header["patientcode"],
                gender=header["gender"],
                birthdate_isostring=str(header["birthdate_date"]),
                # gestational_age_at_birth_days
                # born_premature
                patient_additional=header["patient_additional"],
            )
            signal_text_labels_lpch_normalized = [
                normalize_lpch_signal_label(label) for label in signal_text_labels
            ]
            rec = eegf.create_record_block(
                record_duration_seconds=header["file_duration_seconds"],
                start_isodatetime=str(header["start_datetime"]),
                end_isodatetime=str(header["end_datetime"]),
                number_channels=header["signals_in_file"],
                num_samples_per_channel=nsamples0,
                sample_frequency=fs0,
                signal_labels=signal_text_labels_lpch_normalized,
                signal_physical_mins=signal_physical_mins,
                signal_physical_maxs=signal_physical_maxs,
                signal_digital_mins=signal_digital_mins,
                signal_digital_maxs=signal_digital_maxs,
                physical_dimensions=signal_physical_dims,
                patient_age_days=age.total_seconds() / 86400.0,
                signal_prefilters=signal_prefilters,
                signal_transducers=signal_transducers,
                technician=header["technician"],
                studyadmincode=header["studyadmincode"],
            )
            eegf.write_annotations_b(
                annotations_b)  # may be should be called record annotations
            edfblock_itr = edf_block_iter_generator(
                ef,
                nsamples0,
                # samples_per_chunk roughly 100 datarecords at a time
                100 * ef.samples_in_datarecord(0) * header["signals_in_file"],
                dtype="int32",
            )
            signals = eegf.stream_dig_signal_to_record_block(rec, edfblock_itr)
        return True
def edf2h5_float32(fn, outfn="", hdf_dir="", anonymous=False):
    """
    convert an edf file to hdf5 using a straighforward mapping
    convert to real-valued signals store as float32's

    :param fn: path to the source EDF/EDF+ file
    :param outfn: target HDF5 path; when empty, derived from @fn inside @hdf_dir
    :param hdf_dir: target directory used when @outfn is empty
    :param anonymous: currently unused placeholder (see TODO comments below)

    just getting started here

    --- metadata ---
    number_signals
    sample_frequency
    nsamples
    age
    signal_labels
    Post Menstrual Age
    """
    if not outfn:
        base = os.path.basename(fn)
        base, ext = os.path.splitext(base)
        base = base + DEFAULT_EXT
        outfn = os.path.join(hdf_dir, base)
        debug("outfn:", outfn)
    with edflib.EdfReader(fn) as ef:
        nsigs = ef.signals_in_file
        # again know/assume that this is uniform sampling across signals
        fs = [ef.samplefrequency(ii) for ii in range(nsigs)]
        fs0 = fs[0]
        if any([fs0 != xx for xx in fs]):
            print("error caught multiple sampling frquencies in edf files!!!")
            sys.exit(0)
        nsamples0 = ef.samples_in_file(0)
        debug("nsigs=%s, fs0=%s, nsamples0=%s" % (nsigs, fs0, nsamples0))
        # create file 'w-' -> fail if exists , w -> truncate if exists
        hdf = h5py.File(outfn, "w")
        # use compression? yes! give it a try
        eegdata = hdf.create_dataset(
            "eeg",
            (nsigs, nsamples0),
            dtype="float32",
            # chunks=(nsigs,fs0),
            chunks=True,
            fletcher32=True,
            # compression='gzip',
            # compression='lzf',
            # maxshape=(256,None)
        )
        # no compression -> 50 MiB can view eegdata in vitables
        # compression='gzip' -> 27 MiB slower
        # compression='lzf' -> 35 MiB
        # compression='lzf' maxshape=(256,None) -> 36MiB
        # szip is unavailable
        patient = hdf.create_group("patient")
        # add meta data
        hdf.attrs["number_signals"] = nsigs
        hdf.attrs["sample_frequency"] = fs0
        hdf.attrs["nsamples0"] = nsamples0
        patient.attrs["gender_b"] = ef.gender_b
        patient.attrs["patientname"] = ef.patient_name  # PHI
        debug("birthdate: %s" % ef.birthdate_b, type(ef.birthdate_b))
        # this is a string -> date (datetime)
        if not ef.birthdate_b:
            debug("no birthday in this file")
            birthdate = None
        else:
            birthdate = dateutil.parser.parse(ef.birthdate_b)
            # BUG FIX: this previously referenced undefined name `birthdate_b`
            # (NameError); the parsed `birthdate` is what was meant.
            debug("birthdate (date object):", birthdate)
        start_date_time = datetime.datetime(
            ef.startdate_year,
            ef.startdate_month,
            ef.startdate_day,
            ef.starttime_hour,
            ef.starttime_minute,
            ef.starttime_second,
        )  # ,tzinfo=dateutil.tz.tzlocal())
        debug(start_date_time)
        if start_date_time and birthdate:
            age = start_date_time - birthdate
            debug("age:", age)
        else:
            age = None
        if age:
            patient.attrs["post_natal_age_days"] = age.days
        else:
            # -1 marks "unknown age"
            patient.attrs["post_natal_age_days"] = -1
        # now start storing the lists of things: labels, units...
        # nsigs = len(label_list)
        # variable ascii string (or b'' type)
        str_dt = h5py.special_dtype(vlen=str)
        label_ds = hdf.create_dataset("signal_labels", (nsigs, ), dtype=str_dt)
        units_ds = hdf.create_dataset("signal_units", (nsigs, ), dtype=str_dt)
        labels = []
        units = list()
        # signal_nsamples = []
        for ii in range(nsigs):
            labels.append(ef.signal_label(ii))
            units.append(ef.physical_dimension(ii))
            # self.signal_nsamples.append(self.cedf.samples_in_file(ii))
            # self.samplefreqs.append(self.cedf.samplefrequency(ii))
        # eegdata.signal_labels = labels
        # labels are fixed length strings
        labels_strip = [ss.strip() for ss in labels]
        label_ds[:] = labels_strip
        units_ds[:] = units
        # should be more and a switch for anonymous or not
        # need to change this to
        # Copy the samples one second (fs0 samples) at a time, all channels
        # per chunk; the trailing partial second is handled after the loop.
        nchunks = int(nsamples0 // fs0)
        samples_per_chunk = int(fs0)
        buf = np.zeros((nsigs, samples_per_chunk),
                       dtype="float64")  # buffer is float64_t
        debug("nchunks: ", nchunks, "samples_per_chunk:", samples_per_chunk)
        bookmark = 0  # mark where were are in samples
        for ii in range(nchunks):
            for jj in range(nsigs):
                # read_phys_signal(self, signalnum, start, n,
                #                  np.ndarray[np.float64_t, ndim=1] sigbuf)
                debug(ii, jj)
                ef.read_phys_signal(jj, bookmark, samples_per_chunk, buf[jj])
                # readsignal converts into float
            # conversion from float64 to float32
            eegdata[:, bookmark:bookmark + samples_per_chunk] = buf
            # bookmark should be ii*fs0
            bookmark += samples_per_chunk
        left_over_samples = nsamples0 - nchunks * samples_per_chunk
        debug("left_over_samples:", left_over_samples)
        if left_over_samples > 0:
            for jj in range(nsigs):
                ef.read_phys_signal(jj, bookmark, left_over_samples, buf[jj])
            eegdata[:, bookmark:bookmark +
                    left_over_samples] = buf[:, 0:left_over_samples]
        hdf.close()
#!/usr/bin/python
"""Render a spectrogram of one EDF signal to a PNG file.

Usage: script datafile.edf -o plotfile.png
"""
import sys
import numpy as np
import matplotlib
matplotlib.use('Agg')  # use a non-interactive renderer
import matplotlib.pyplot as plt
import edflib

if len(sys.argv) != 4 or sys.argv[2] != "-o":
    print("usage:\n\t{0} datafile.edf -o plotfile.png\n".format(sys.argv[0]))
else:
    # load the data
    signal_num = 0  # TODO: allow this to be specified on the command line!
    edf = edflib.EdfReader(sys.argv[1])
    data = edf.readSignal(signal_num)
    x_bin_size = 8192
    # plot the data
    plt.figure(1, figsize=(8, 3))
    # BUG FIX: noverlap must be an integer; under Python 3 `x_bin_size/2`
    # produces a float, which specgram rejects. Use floor division.
    plt.specgram(data, Fs=edf.samplefrequency(signal_num), NFFT=x_bin_size,
                 noverlap=x_bin_size // 2)
    plt.xlim(0, data.shape[0] / edf.samplefrequency(signal_num))
    plt.ylim(0, edf.samplefrequency(signal_num) / 2)  # Nyquist frequency
    plt.title("Spectrogram of {0}".format(sys.argv[1]))
    plt.xlabel("Time (seconds)")
    plt.ylabel("Frequency (Hz)")
    plt.tight_layout()
    plt.savefig(sys.argv[3], dpi=300)
# now let's see how it looks with edf files import edflib import os.path # In[47]: TUH_SZ_ROOT = '/mnt/data1/eegdbs/TUH/temple/tuh-sz-v1.2.0/v1.2.0' tuhedf_fn = os.path.join( TUH_SZ_ROOT, 'eval/01_tcp_ar/00006059/s003_2012_05_25/00006059_s003_t000.edf') tuhedf_fn = '../../eeg-hdfstorage/data/00000115_s07_a01.edf' # In[48]: print(tuhedf_fn) ef = edflib.EdfReader(tuhedf_fn) # In[49]: N = 27 [ef.samplefrequency(ch) for ch in range(N)] # In[50]: ef.get_signal_text_labels() # In[51]: ef.get_samples_per_signal() # In[52]:
def predict_labels(test_data_file, model_session_num, signal_indices, mute):
    """Predict a sleep-stage label sequence for an EDF recording with a
    3-state Gaussian HMM (state order W, N, R).

    :param test_data_file: EDF file to classify
    :param model_session_num: session id used to locate the saved transition
        matrix and emission parameters under ./data/
    :param signal_indices: indices of the EDF signals to extract features from
    :param mute: passed through to get_epoch_data to suppress messages
    :returns: array of per-epoch state labels from hmm.GaussianHMM.predict
    """
    # 1) Get data
    e = edflib.EdfReader(test_data_file)
    freqs = get_signal_frequencies(e, signal_indices)
    signals = get_signals(e, signal_indices)
    print "Signal frequencies: ", freqs
    num_epochs = number_of_epochs(signals[0], freqs[0])
    first_feat = True
    # Retrieve data in ei-th epoch
    for ei in xrange(0, num_epochs):
        # For each signal
        first_sig = True
        for sid in range(0, len(signal_indices)):
            # Retrieve data in ei-th epoch
            epoch_data = get_epoch_data(signals[sid], freqs[sid], ei, 1, mute)
            # Extract the features from the data
            epoch_feats = get_features(epoch_data, freqs[sid])
            # Features is features vector composed of features of many signals stacked together
            if first_sig is True:
                features = epoch_feats
                first_sig = False
            else:
                features = np.hstack((features, epoch_feats))
        # Stack each epoch's feature row into the observation matrix.
        if first_feat is True:
            feat_mat = features
            first_feat = False
        else:
            feat_mat = np.vstack((feat_mat, features))
    # 2) Construct and train model
    from hmmlearn import hmm
    trans_mat_file = './data/transition_matrices/' + model_session_num + '_transitions.npy'
    print "Loading transition matrix %s" % trans_mat_file
    params = np.load('./data/params/' + model_session_num + '.npz')
    print "Loading emission parameters from ./data/params/%s.npz" % model_session_num
    mu_w = params['mu_w']; mu_n = params['mu_n']; mu_r = params['mu_r'];
    sigma_w = params['sigma_w']; sigma_n = params['sigma_n']; sigma_r = params['sigma_r'];
    covar_type = "diag"
    if covar_type == "diag":
        # Covariance matrices *must* be diagonal to avoid problems with positive-definite, symmetric requirements
        size = sigma_w.shape[0]
        sigma_w = np.multiply(np.identity(size), sigma_w).diagonal()
        sigma_n = np.multiply(np.identity(size), sigma_n).diagonal()
        sigma_r = np.multiply(np.identity(size), sigma_r).diagonal()
        # TODO: set negative values to 0
    model = hmm.GaussianHMM(n_components=3, covariance_type=covar_type, n_iter=100)
    model.means_ = np.array([mu_w, mu_n, mu_r])
    model.covars_ = np.array([sigma_w, sigma_n, sigma_r])
    # State order: W, N, R
    start_probs = np.array([0.6, 0.4, 0.0])
    assert np.sum(start_probs) == 1
    model.startprob_ = start_probs
    # The saved matrix is transposed relative to hmmlearn's row-stochastic
    # transmat_ convention.
    model.transmat_ = np.load(trans_mat_file).transpose()
    # 3) Predict the labels of the feature matrix
    # model.predict(X) where X: array-like, shape (n_samples, n_features)
    # Returns Label matrix L where L is array of n_samples labels
    L = model.predict(feat_mat)
    return L
# Load the per-patient annotation matrices (if the MATLAB file is present)
# and read every .edf recording in the folder into an int32 array.
i = 0
if isfile(join(foldername, 'annotations_EEGdata.mat')):
    # annotations_EEGdata.mat is a MATLAB v7.3 file, hence readable via h5py
    with h5py.File(join(foldername, 'annotations_EEGdata.mat')) as file:
        for c in file['annotat_new']:
            for r in range(len(c)):
                # dereference the HDF5 object reference and keep the raw array
                annotations.append(file[c[r]][()])
                i = i + 1
    print(annotations)
eegdata = []
for file in filelist:
    fpath = join(foldername, file)
    p, ext = os.path.splitext(fpath)
    if(ext == '.edf'):
        print(fpath + " is a .edf file")
        edf = edflib.EdfReader(fpath)
        samples, nSigs = fileinfo(edf)
        # one column per signal, filled in-place by readsignals()
        sig1 = np.zeros((samples, nSigs), dtype='int32')
        buf = np.zeros(samples, dtype='int32')
        readsignals(edf, samples, sig1, buf)
        # keep the sample array together with its extension-less path
        datName = (sig1, p)
        eegdata.append(datName)
        print(sig1)
    else:
        print(fpath + " is not a .edf file")
records, validation = getLeaveOneOut(annotations, eegdata, 0);
def compute_emission_distribution(data_file, label_file): e = edflib.EdfReader(data_file) # Signal numbers # 5 -> EOG Left (freq: 50Hz) # 7 -> EEG_1 (freq: 125Hz) # 8 -> Respiration (freq: 10Hz) signal_indices = [4, 7, 8] # Signal frequencies eogl_f = 125; eeg1_f = 125; resp_f = 10; eogl, eeg1, resp = load_signals(e, signal_indices) num_epochs = number_of_epochs(eeg1, eeg1_f) print "Number of epochs: %d" %num_epochs num_epochs_verify = number_of_epochs(eogl, eogl_f) assert num_epochs == num_epochs_verify # 2) Load epoch labels from labeled data stages = get_stages_array(label_file) assert len(stages) == num_epochs # 3) Create feature matrices for each hidden state first_w, first_n, first_r = [True for i in range(0, 3)] # For each epoch in the signal (ei: epoch index) for ei in xrange(0, num_epochs): # Retrieve data in ei-th epoch e1 = get_epoch_data(eeg1, eeg1_f, ei, 1, mute) # EEG 1 r = get_epoch_data(resp, resp_f, ei, 1, mute) # Respiration eol = get_epoch_data(eogl, eogl_f, ei, 1, mute) # EOG Left # Data label (Wake, NREM, REM) label = stages[ei] # Extract the features from the data e1_features = get_features(e1, eeg1_f) r_features = get_features(r, resp_f) eogl_features = get_features(eol, eogl_f) # Features is features vector composed of features of many signals stacked together features = np.hstack((e1_features, r_features, eogl_features)) if label == 'W': if mute is False: print "Epoch has label [Wake]" if first_w is True: w_feats = features first_w = False elif first_w is False: w_feats = np.column_stack((w_feats, features)) if label == "N": if mute is False: print "Epoch has label [NREM]" if first_n is True: n_feats = features first_n = False elif first_n is False: n_feats = np.column_stack((n_feats, features)) if label == "R": if mute is False: print "Epoch has label [REM]" if first_r is True: r_feats = features first_r = False elif first_r is False: r_feats = np.column_stack((r_feats, features)) # print w_feats; print n_feats; print r_feats
def dump_edf_info(filename):
    """Print a human-readable dump of the header information in an EDF file.

    Purely diagnostic: reads per-signal metadata (sampling rates, digital and
    physical ranges, labels, prefilters, transducers) plus patient/recording
    fields from *filename* and prints them; returns None.
    """
    with edflib.EdfReader(filename) as ef:
        # all the data point related stuff
        nsigs = ef.signals_in_file
        # again know/assume that this is uniform sampling across signals
        fs0 = ef.samplefrequency(0)
        nsamples0 = ef.samples_in_file(0)
        print('nsigs=%s, fs0=%s, nsamples0=%s\n' % (nsigs, fs0, nsamples0))

        num_samples_per_signal = ef.get_samples_per_signal()
        print("num_samples_per_signal::\n", repr(num_samples_per_signal), '\n')

        file_duration_seconds = ef.file_duration_seconds
        print("file_duration_seconds", repr(file_duration_seconds))

        signal_frequency_array = ef.get_signal_freqs()
        print("signal_frequency_array::\n", repr(signal_frequency_array))

        annotations = ef.read_annotations_b()
        print("annotations::\n", repr(annotations))

        signal_text_labels = ef.get_signal_text_labels()
        print("signal_text_labels::\n", repr(signal_text_labels))
        # ef.recording_additional
        print()

        signal_digital_mins = [ef.digital_min(ch) for ch in range(nsigs)]
        signal_digital_total_min = min(signal_digital_mins)
        print("digital mins:", repr(signal_digital_mins))
        print("digital total min:", repr(signal_digital_total_min))

        signal_digital_maxs = [ef.digital_max(ch) for ch in range(nsigs)]
        signal_digital_total_max = max(signal_digital_maxs)
        print("digital maxs:", repr(signal_digital_maxs))
        print("digital total max:", repr(signal_digital_total_max))

        signal_physical_dims = [
            ef.physical_dimension(ch) for ch in range(nsigs)
        ]
        print('\nsignal_physical_dims::')
        pprint(signal_physical_dims)

        signal_physical_maxs = [ef.physical_max(ch) for ch in range(nsigs)]
        print('\nsignal_physical_maxs::')
        pprint(signal_physical_maxs)

        # FIX: was ef.physical_max(ch) — a copy-paste bug that printed the
        # maxima twice and never showed the physical minima.
        signal_physical_mins = [ef.physical_min(ch) for ch in range(nsigs)]
        print('\nsignal_physical_mins::')
        pprint(signal_physical_mins)

        print('gender:', repr(ef.gender_b))
        print('admincode:', repr(ef.admincode))
        print('birthdate:', repr(ef.birthdate_b))  # this is a string

        # birthdate_b is free text in the header; it may not parse as a date.
        if ef.birthdate_b:
            try:
                birthdate = dateutil.parser.parse(ef.birthdate_b)
            except ValueError:
                birthdate = None
        else:
            birthdate = None
        print('birthdate as datetime:', repr(birthdate))
        print('equipment:', repr(ef.equipment))
        print('patient:', repr(ef.patient))
        print('patientname:', repr(ef.patientname))
        print('patientcode:', repr(ef.patientcode_b))
        print('patient_additional:', repr(ef.patient_additional))
        print('recording_additional:', repr(ef.recording_additional))

        # or use arrow
        start_date_time = datetime.datetime(ef.startdate_year, ef.startdate_month,
                                            ef.startdate_day, ef.starttime_hour,
                                            ef.starttime_minute,
                                            ef.starttime_second)  # tz naive
        print('start_date_time:', start_date_time)
        print()

        # this don't seem to be used much so I will put at end
        signal_prefilters = [ef.prefilter(ch) for ch in range(nsigs)]
        print('signal_prefilters::\n')
        pprint(signal_prefilters)

        signal_transducer = [ef.transducer(ch) for ch in range(nsigs)]
        print('signal_transducer::\n')
        pprint(signal_transducer)
def edf2hdf2(fn, outfn='', hdf_dir='', anonymize=False):
    """ convert an edf file to hdf5 using fairly straightforward mapping
    return True if successful

    @database_sourcel_label tells us which database it came from LPCH_NK or STANFORD_NK
        this is important!

    :param fn: path to the source .edf file
    :param outfn: output .h5 path; derived from fn/hdf_dir when empty
    :param hdf_dir: output directory used when outfn is empty
    :param anonymize: currently unused (anonymizer code is commented out)
    :returns: (True, validator) on success; validator is always None here
              since the validation code below is commented out
    """
    # Derive "<basename>.eeg.h5" next to the source (or in hdf_dir) when no
    # explicit output path was given.
    if not outfn:
        dir_name = os.path.dirname(fn)
        if not hdf_dir:
            hdf_dir = dir_name

        base = os.path.basename(fn)
        base, ext = os.path.splitext(base)

        base = base + '.eeg.h5'
        outfn = os.path.join(hdf_dir, base)
        # print('outfn:', outfn)

    # all the data point related stuff
    with edflib.EdfReader(fn) as ef:

        # read all EDF+ header information in just the way I want it
        header = {
            'file_name': os.path.basename(fn),
            'filetype': ef.filetype,
            'patient_name': ef.patient_name,
            'patientcode': ef.patientcode,
            'studyadmincode': ef.admincode,
            'gender': ef.gender,
            'signals_in_file': ef.signals_in_file,
            'datarecords_in_file': ef.datarecords_in_file,
            'file_duration_100ns': ef.file_duration_100ns,
            'file_duration_seconds': ef.file_duration_seconds,
            'startdate_date': datetime.date(ef.startdate_year,
                                            ef.startdate_month,
                                            ef.startdate_day),
            'start_datetime': datetime.datetime(ef.startdate_year,
                                                ef.startdate_month,
                                                ef.startdate_day,
                                                ef.starttime_hour,
                                                ef.starttime_minute,
                                                ef.starttime_second),
            'starttime_subsecond_offset': ef.starttime_subsecond,
            'birthdate_date': ef.birthdate_date,
            'patient_additional': ef.patient_additional,
            'admincode': ef.admincode,  # usually the study eg. C13-100
            'technician': ef.technician,
            'equipment': ef.equipment,
            'recording_additional': ef.recording_additional,
            'datarecord_duration_100ns': ef.datarecord_duration_100ns,
        }
        pprint.pprint(header)

        #### validation code #####
        # NOTE(review): validation is disabled; validator stays None and is
        # returned as-is at the end of the function.
        validator = None
        # if source_database_label=='LPCH_NK':
        #     validator = ValidateTrackHeaderLPCH(header=header)
        # elif source_database_label== 'STANFORD_NK':
        #     validator = ValidateTrackHeaderStanford(header=header)
        # else:
        #     raise ValidationError

        # if not validator.is_valid():
        #     print('problem with this file:', fn)
        #     print(validator.errors,validator.error_code,
        #           validator.error_params)

        #     return False, validator
        # else:
        #     print('\nvalid header::')
        #     pprint.pprint(validator.cleaned_data)
        #     header = validator.cleaned_data

        # from here on the header is valid and cleaned

        # use arrow
        start_datetime = header['start_datetime']

        # end_date_time = datetime.datetime(ef.enddate_year, ef.enddate_month, ef.enddate_day, ef.endtime_hour,
        # ef.endtime_minute, ef.endtime_second) # tz naive
        # end_date_time - start_date_time
        duration = datetime.timedelta(seconds=header['file_duration_seconds'])

        # derived information
        birthdate = header['birthdate_date']
        if birthdate:
            # arrow subtraction yields a timedelta; age is used later only
            # via total_seconds() for patient_age_days.
            age = arrow.get(start_datetime) - arrow.get(
                header['birthdate_date'])
            debug('predicted age: %s' % age)
            # total_seconds() returns a float
            debug('predicted age (seconds): %s' % age.total_seconds())
        else:
            age = datetime.timedelta(seconds=0)

        # if anonymize:
        #     if source_database_label== 'LPCH_NK':
        #         anonymizer = AnonymizeTrackHeaderLPCH(header, source_database_label=source_database_label)
        #     if source_database_label == 'STANFORD_NK':
        #         anonymizer = AnonymizeTrackHeaderStanford(header, source_database_label=source_database_label)
        #     header = anonymizer.anonymous_header  # replace the original header with the anonymous one
        #     print('anonymized header')
        #     pprint.pprint(header)

        # anonymized version if necessary
        header['end_datetime'] = header['start_datetime'] + duration

        ############# signal array information ##################

        # signal block related stuff
        nsigs = ef.signals_in_file

        # again know/assume that this is uniform sampling across signals
        fs0 = ef.samplefrequency(0)
        signal_frequency_array = ef.get_signal_freqs()

        # Print runs of consecutive channels that share a sampling rate,
        # splitting wherever the rate changes.
        dfs = np.diff(signal_frequency_array)
        dfs_ind = np.where(dfs != 0.0)
        dfs_ind = dfs_ind[0]
        last_ind = 0
        for dd in dfs_ind + 1:
            print("block:", signal_frequency_array[last_ind:dd])
            last_ind = dd
        print("last block:", signal_frequency_array[last_ind:])

        print("where does sampling rate change?", np.where(dfs != 0.0))
        print("elements:", signal_frequency_array[np.where(dfs != 0.0)])
        print("signal_frequency_array::\n", repr(signal_frequency_array))
        print("len(signal_frequency_array):", len(signal_frequency_array))
        # NOTE(review): excludes the last 3 channels from the uniform-rate
        # assertion — presumably annotation/aux channels; confirm.
        assert all(signal_frequency_array[:-3] == fs0)

        nsamples0 = ef.samples_in_file(0)  # samples per channel
        print('nsigs=%s, fs0=%s, nsamples0=%s\n' % (nsigs, fs0, nsamples0))

        num_samples_per_signal = ef.get_samples_per_signal()  # np array
        print("num_samples_per_signal::\n", repr(num_samples_per_signal), '\n')

        # assert all(num_samples_per_signal == nsamples0)

        file_duration_sec = ef.file_duration_seconds
        #print("file_duration_sec", repr(file_duration_sec))

        # Note that all annotations except the top row must also specify a duration.

        # long long onset; /* onset time of the event, expressed in units of 100
        #                     nanoSeconds and relative to the starttime in the header */

        # char duration[16]; /* duration time, this is a null-terminated ASCII text-string */

        # char annotation[EDFLIB_MAX_ANNOTATION_LEN + 1]; /* description of the
        #                 event in UTF-8, this is a null term string of max length 512*/

        # start("x.y"), end, char[20]
        # annotations = ef.read_annotations_as_array() # get numpy array of
        # annotations
        annotations_b = ef.read_annotations_b_100ns_units()

        # print("annotations_b::\n")
        # pprint.pprint(annotations_b)  # get list of annotations

        signal_text_labels = ef.get_signal_text_labels()
        print("signal_text_labels::\n")
        pprint.pprint(signal_text_labels)
        print("normalized text labels::\n")
        signal_text_labels_lpch_normalized = [
            normalize_lpch_signal_label(label) for label in signal_text_labels
        ]
        pprint.pprint(signal_text_labels_lpch_normalized)

        # ef.recording_additional

        # print()
        signal_digital_mins = np.array(
            [ef.digital_min(ch) for ch in range(nsigs)])
        signal_digital_total_min = min(signal_digital_mins)

        print("digital mins:", repr(signal_digital_mins))
        print("digital total min:", repr(signal_digital_total_min))

        signal_digital_maxs = np.array(
            [ef.digital_max(ch) for ch in range(nsigs)])
        signal_digital_total_max = max(signal_digital_maxs)

        print("digital maxs:", repr(signal_digital_maxs))
        #print("digital total max:", repr(signal_digital_total_max))

        signal_physical_dims = [
            ef.physical_dimension(ch) for ch in range(nsigs)
        ]
        # print('signal_physical_dims::\n')
        # pprint.pprint(signal_physical_dims)
        #print()

        signal_physical_maxs = np.array(
            [ef.physical_max(ch) for ch in range(nsigs)])
        #print('signal_physical_maxs::\n', repr(signal_physical_maxs))

        signal_physical_mins = np.array(
            [ef.physical_min(ch) for ch in range(nsigs)])
        #print('signal_physical_mins::\n', repr(signal_physical_mins))

        # this don't seem to be used much so I will put at end
        signal_prefilters = [ef.prefilter(ch).strip() for ch in range(nsigs)]
        #print('signal_prefilters::\n')
        # pprint.pprint(signal_prefilters)
        #print()
        signal_transducers = [ef.transducer(ch).strip() for ch in range(nsigs)]
        #print('signal_transducers::\n')
        #pprint.pprint(signal_transducers)

        # Fallback parsing of the raw patient field when the structured
        # patient_name field is empty (plain "EDF" files).
        if not header[
                'patient_name']:  # iassume is "EDF" file so need to split patient info
            try:
                s = ef.patient
                print('local subject:', s)
                # the age_str seems illegal per edfplus
                patientcode, gender, dob, name, age_str = s.split()
                print("try this:")
                header['patient_name'] = name
                header['birthdate_date'] = dob
                header['gender'] = gender
                header['patient_additional'] = age_str
                header['patientcode'] = patientcode
            except:
                # NOTE(review): bare except silently ignores any parse
                # failure and falls through to the cruder split below.
                pass
            try:
                s = ef.patient
                ss = s.split()
                if len(ss) > 0:
                    header['patient_name'] = ss[0]
                    header['patient_additional'] = s
            except:
                pass

        # Stream everything into the output HDF5 container.
        with eeghdf.EEGHDFWriter(outfn, 'w') as eegf:
            eegf.write_patient_info(
                patient_name=header['patient_name'],
                patientcode=header['patientcode'],
                gender=header['gender'],
                birthdate_isostring=header['birthdate_date'],
                # gestational_age_at_birth_days
                # born_premature
                patient_additional=header['patient_additional'])

            signal_text_labels_lpch_normalized = [
                normalize_lpch_signal_label(label)
                for label in signal_text_labels
            ]

            rec = eegf.create_record_block(
                record_duration_seconds=header['file_duration_seconds'],
                start_isodatetime=str(header['start_datetime']),
                end_isodatetime=str(header['end_datetime']),
                number_channels=header['signals_in_file'],
                num_samples_per_channel=nsamples0,
                sample_frequency=fs0,
                signal_labels=signal_text_labels_lpch_normalized,
                signal_physical_mins=signal_physical_mins,
                signal_physical_maxs=signal_physical_maxs,
                signal_digital_mins=signal_digital_mins,
                signal_digital_maxs=signal_digital_maxs,
                physical_dimensions=signal_physical_dims,
                patient_age_days=age.total_seconds() / 86400.0,
                signal_prefilters=signal_prefilters,
                signal_transducers=signal_transducers,
                technician=header['technician'],
                studyadmincode=header['studyadmincode'])

            eegf.write_annotations_b(
                annotations_b)  # may be should be called record annotations

            # samples_per_chunk roughly 100 datarecords at a time
            edfblock_itr = edf_block_iter_generator(
                ef,
                nsamples0,
                100 * ef.samples_in_datarecord(0) * header['signals_in_file'],
                dtype='int32')

            signals = eegf.stream_dig_signal_to_record_block(rec, edfblock_itr)

        return True, validator  # we succeeded
# Jupytext/notebook-style exploration cells ("# %%" markers delimit cells);
# interactively inspects dynaconf settings and one neonatal EEG EDF file.
from dynaconf import settings

# Root of the TUH Temple seizure corpus, v1.5.1.
EEGML_TEMPLE_SZv151 = "/mnt/data1/eegdbs/TUH/temple/tuh_eeg_seizure/v1.5.1"

# %%
# Dump the full dynaconf configuration as a dict (cell output).
settings.as_dict()

# %%
settings.COMMENTJSON_ENABLED_FOR_DYNACONF

# %%

# %%

# %% {"colab": {}, "colab_type": "code", "id": "iWiqgdeS60E9"}
# NOTE(review): edflib is assumed to be imported earlier in the notebook.
ef = edflib.EdfReader('/mnt/data1/eegdbs/stevenson_neonatal_eeg/edf/eeg10.edf')

# %% {"colab": {"base_uri": "https://localhost:8080/", "height": 34}, "colab_type": "code", "id": "aI5PiQ9E60MH", "outputId": "d1191754-f4f6-4948-b453-80d0f6ca7e4f"}
ef.read_annotations()

# %% {"colab": {"base_uri": "https://localhost:8080/", "height": 374}, "colab_type": "code", "id": "TuPdjP8h60O6", "outputId": "5bf5dc0c-0d3e-43bd-8661-f3bc78dcb9b1"}
labels = ef.get_signal_text_labels()
labels

# %% {"colab": {"base_uri": "https://localhost:8080/", "height": 68}, "colab_type": "code", "id": "gS4apn7160Rs", "outputId": "53090a54-d8b4-4b70-db46-0378dd260d72"}
ef.get_samples_per_signal()

# %% {"colab": {"base_uri": "https://localhost:8080/", "height": 51}, "colab_type": "code", "id": "nlGv2EzM60Uh", "outputId": "6ff3acee-28c8-4185-f781-ffa3ef2080d3"}
ef.get_signal_freqs()

# %% {"colab": {"base_uri": "https://localhost:8080/", "height": 34}, "colab_type": "code", "id": "jmaGFIEY-HA1", "outputId": "3a1b1463-3de5-41db-d33d-a88b721c53bc"}