        not_matched.append(f'{p_name}')
    return matches, not_matched


#%% main
if __name__ == '__main__':
    max_age_diff = cfg.max_age_diff

    # get the mappings from names to codes
    mappings = misc.get_mapping()

    # get the list of all patients and controls
    patients_all = read_subjects(cfg.patients_csv)
    controls_all = read_subjects(cfg.controls_csv)

    # ignore these items when creating the matching
    to_discard = [line[0] for line in misc.read_csv(cfg.edfs_discard) if line[2] == '1']
    controls = {c: controls_all[c] for c in controls_all.copy() if c not in to_discard}
    patients = {p: patients_all[p] for p in patients_all.copy() if p not in to_discard}

    # matches, not_matched = greedy_matching(patients.copy(), controls.copy(), max_age_diff=max_age_diff)
    matches, not_matched = bootstrap_matchings(patients.copy(), controls.copy(),
                                               iterations=10000000,
                                               max_age_diff=max_age_diff)
    # matches, not_matched = pymatch_matching(patients.copy(), controls.copy())
    check_matches_unique(matches, not_matched)

    # now we create the csv string that we will write to a file:
    lines = ['#Patient Name; Patient Code; Patient Gender; Patient Age; '
             'Control Name; Control Code; Control Gender; Control Age; Difference']
    for diff, match_i in enumerate(matches):  # last one
        lines += ['']  # add empty line before each new age-diff section
        lines += [f'# +-{diff} age difference, {len(match_i)} matchings']
        for p_name, c_name in match_i:
import os
import sys
import matplotlib
import matplotlib.pyplot as plt
from misc import read_csv
from misc import markers
from misc import colors
from misc import exist
from misc import err

args = sys.argv
if len(args) < 3:
    print("python3 csv_spectra_plot_class.py "
          "[csv input file with spectra (nm)] [selected field for legending]")

'''read the csv and locate the spectra'''
fields, data = read_csv(args[1])
nf = len(fields)  # number of fields
f_i = {fields[i]: i for i in range(nf)}

if len(args) < 3:
    # no field given: call the program on all non-spectral fields!
    for f in fields:
        if (f[-2:] != 'nm') and \
           (f not in ['ObjectID', 'GlobalID', 'x', 'y', 'ctr_lat', 'ctr_lon', 'image']):
            cmd = 'python3 ' + __file__ + ' ' + args[1] + ' ' + f
            print(cmd)
            a = os.system(cmd)
    sys.exit(1)

if args[2] not in fields:
    print("Error: field not found:", args[2])
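# Example invocations (hypothetical file/field names, matching the usage string and the
# self-invocation logic above; not taken from the original sources):
#   python3 csv_spectra_plot_class.py spectra.csv class_label   # plot spectra legended by one field
#   python3 csv_spectra_plot_class.py spectra.csv               # re-invokes itself on every non-spectral field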
def to_unisens(edf_file, unisens_folder, overwrite=False, tqdm_desc=None,
               skip_exist=False):
    # %% create unisens
    if tqdm_desc is None:
        tqdm_desc = lambda x: None

    dtype = np.int16
    code = ospath.basename(edf_file)[:-4]
    folder = ospath.dirname(edf_file)

    unisens_folder = ospath.join(unisens_folder, code)
    if skip_exist and ospath.isdir(unisens_folder):
        return

    # get all additional files that belong to this EDF
    add_files = ospath.list_files(folder, patterns=code + '*')

    u = Patient(unisens_folder, makenew=False, autosave=True, measurementId=code)
    header = read_edf_header(edf_file)
    all_labels = header['channels']

    u.starttime = header['startdate']
    u.timestampStart = header['startdate'].strftime('%Y-%m-%dT%H:%M:%S')
    u.code = code

    attribs = misc.get_attribs()
    u.group = attribs[code].get('group', 'none')
    u.gender = attribs[code].get('gender', 'none')
    u.drug_hrv = attribs[code].get('drug_hrv', 0)
    u.drug_sleep = attribs[code].get('drug_sleep', 0)
    u.age = attribs[code].get('age', -1)
    u.match = attribs[code].get('match', '')

    u.channels = str(', '.join(header['channels']))
    u.startsec = (u.starttime.hour * 60 + u.starttime.minute) * 60 + u.starttime.second
    u.use_offset = 1

    # if the ECG/EEG is broken, mark it
    edfs_ecg_broken = [p[1] for p in misc.read_csv(cfg.edfs_discard) if p[3] == '1']
    edfs_eeg_broken = [p[1] for p in misc.read_csv(cfg.edfs_discard) if p[4] == '1']

    # we need to see if the ECG/EEG of this file can be used;
    # if one of them is broken we also remove its match from the analysis
    u.ecg_broken = (code in edfs_ecg_broken) or (u.match in edfs_ecg_broken)
    u.eeg_broken = (code in edfs_eeg_broken) or (u.match in edfs_eeg_broken)

    # %% #### add ECG ##########
    ############################
    tqdm_desc(f'{code}: Reading ECG')
    if 'ECG' not in u or overwrite:
        signals, shead, header = read_edf(edf_file, ch_names=['ECG I'],
                                          digital=True, verbose=False)
        # trick for viewer automatic scaling
        signals[:, 0:2] = np.percentile(signals, 10), np.percentile(signals, 90)

        pmin, pmax = shead[0]['physical_min'], shead[0]['physical_max']
        dmin, dmax = shead[0]['digital_min'], shead[0]['digital_max']
        lsb, offset = sleep_utils.minmax2lsb(dmin, dmax, pmin, pmax)
        attrib = {'data': signals.astype(dtype),
                  'sampleRate': shead[0]['sample_rate'],
                  'ch_names': 'ECG',
                  'lsbValue': lsb,
                  'baseline': offset,
                  'unit': 'mV',
                  'dmin': dmin, 'dmax': dmax,
                  'pmin': pmin, 'pmax': pmax}
        SignalEntry(id='ECG.bin', parent=u).set_data(**attrib)

        u.sampling_frequency = shead[0]['sample_rate']
        u.duration = len(signals.squeeze()) // shead[0]['sample_rate']
        u.epochs_signals = signals.shape[1] // int(u.sampling_frequency) // 30

    # %% #### add EEG ##########
    ############################
    tqdm_desc(f'{code}: Reading EEG')
    if 'EEG' not in u or overwrite:
        chs = sleep_utils.infer_eeg_channels(all_labels)
        signals, shead, header = read_edf(edf_file, ch_names=chs,
                                          digital=True, verbose=False)
        if isinstance(signals, list):
            signals = np.atleast_2d(signals[0])
            chs = chs[0]
        # trick for viewer automatic scaling
        signals[:, 0:2] = np.percentile(signals, 10), np.percentile(signals, 90)

        pmin, pmax = shead[0]['physical_min'], shead[0]['physical_max']
        dmin, dmax = shead[0]['digital_min'], shead[0]['digital_max']
        lsb, offset = sleep_utils.minmax2lsb(dmin, dmax, pmin, pmax)
        attrib = {'data': signals.astype(dtype),
                  'sampleRate': shead[0]['sample_rate'],
                  'ch_names': chs,
                  'lsbValue': lsb,
                  'baseline': offset,
                  'contentClass': 'EEG',
                  'unit': 'uV',
                  'dmin': dmin, 'dmax': dmax,
                  'pmin': pmin, 'pmax': pmax}
        SignalEntry(id='EEG.bin', parent=u).set_data(**attrib)

    # %% #### add EOG ##########
    ############################
    if 'EOG' not in u or overwrite:
        tqdm_desc(f'{code}: Reading EOG')
        chs = sleep_utils.infer_eog_channels(all_labels)
        signals, shead, header = read_edf(edf_file, ch_names=chs,
                                          digital=True, verbose=False)
        if isinstance(signals, list):
            signals = np.atleast_2d(signals[0])
            chs = chs[0]
        # trick for viewer automatic scaling
        signals[:, 0:2] = np.percentile(signals, 10), np.percentile(signals, 90)

        pmin, pmax = shead[0]['physical_min'], shead[0]['physical_max']
        dmin, dmax = shead[0]['digital_min'], shead[0]['digital_max']
        lsb, offset = sleep_utils.minmax2lsb(dmin, dmax, pmin, pmax)
        attrib = {'data': signals.astype(dtype),
                  'sampleRate': shead[0]['sample_rate'],
                  'ch_names': chs,
                  'lsbValue': 1,
                  'baseline': 0,
                  'unit': 'uV',
                  'dmin': dmin, 'dmax': dmax,
                  'pmin': pmin, 'pmax': pmax}
        SignalEntry(id='EOG.bin', parent=u).set_data(**attrib)

    # %% #### add EMG ##########
    ############################
    if 'EMG' not in u or overwrite:
        tqdm_desc(f'{code}: Reading EMG')
        chs = sleep_utils.infer_emg_channels(all_labels)
        if chs != []:  # fix for 888_49272
            signals, shead, header = read_edf(edf_file, ch_names=chs,
                                              digital=True, verbose=False)
            if isinstance(signals, list):
                signals = np.atleast_2d(signals[0])
                chs = chs[0]
            # trick for viewer automatic scaling
            signals[:, 0:2] = np.percentile(signals, 10), np.percentile(signals, 90)

            pmin, pmax = shead[0]['physical_min'], shead[0]['physical_max']
            dmin, dmax = shead[0]['digital_min'], shead[0]['digital_max']
            lsb, offset = sleep_utils.minmax2lsb(dmin, dmax, pmin, pmax)
            attrib = {'data': signals.astype(dtype),
                      'sampleRate': shead[0]['sample_rate'],
                      'ch_names': chs,
                      'lsbValue': 1,
                      'baseline': 0,
                      'unit': 'uV',
                      'dmin': dmin, 'dmax': dmax,
                      'pmin': pmin, 'pmax': pmax}
            SignalEntry(id='EMG.bin', parent=u).set_data(**attrib)

    # %% #### add Thorax ##########
    ###############################
    if 'thorax' not in u or overwrite:
        tqdm_desc(f'{code}: Reading Thorax')
        signals, shead, header = read_edf(edf_file, ch_names=['Thorax'],
                                          digital=True, verbose=False)
        # trick for viewer automatic scaling
        signals[:, 0:2] = np.percentile(signals, 10), np.percentile(signals, 90)

        pmin, pmax = shead[0]['physical_min'], shead[0]['physical_max']
        dmin, dmax = shead[0]['digital_min'], shead[0]['digital_max']
        lsb, offset = sleep_utils.minmax2lsb(dmin, dmax, pmin, pmax)
        attrib = {'data': signals.astype(dtype),
                  'sampleRate': shead[0]['sample_rate'],
                  'ch_names': 'thorax',
                  'lsbValue': 1,
                  'baseline': 0,
                  'unit': 'uV',
                  'dmin': dmin, 'dmax': dmax,
                  'pmin': pmin, 'pmax': pmax}
        SignalEntry(id='thorax.bin', parent=u).set_data(**attrib)

    # %% #### add Body / Lagesensor (position sensor) ##########
    ############################################################
    if ('body' not in u or overwrite) and 'Body' in all_labels:
        tqdm_desc(f'{code}: Reading Body')
        signals, shead, header = read_edf(edf_file, ch_names=['Body'],
                                          digital=True, verbose=False)
        signals[:, 0:2] = np.percentile(signals, 10), np.percentile(signals, 90)

        if np.ptp(signals) < 10:  # we have some weird body positions that we can't decode
            pmin, pmax = shead[0]['physical_min'], shead[0]['physical_max']
            dmin, dmax = shead[0]['digital_min'], shead[0]['digital_max']
            # position codes (German): 1 = prone, 2 = upright, 3 = left, 4 = right,
            # 5 = upright (headstand), 6 = supine
            comment = 'Lagesensor: 1 = Bauchlage, 2 = aufrecht, 3 = links, 4 = rechts,' \
                      '5 = aufrecht (Kopfstand), 6 = Rückenlage'
            lsb, offset = sleep_utils.minmax2lsb(dmin, dmax, pmin, pmax)
            attrib = {'data': signals.astype(dtype),
                      'sampleRate': shead[0]['sample_rate'],
                      'ch_names': 'body',
                      'lsbValue': 1,
                      'baseline': 0,
                      'unit': 'uV',
                      'dmin': dmin, 'dmax': dmax,
                      'pmin': pmin, 'pmax': pmax,
                      'comment': comment}
            SignalEntry(id='body.bin', parent=u).set_data(**attrib)

    # %% add annotations ##########
    ###############################
    if 'annotations' not in u or overwrite:
        annotations = header['annotations']
        if annotations != []:
            annot_entry = EventEntry('annotations.csv', parent=u)
            annotations = [[int(a[0] * 1000), a[2]] for a in annotations]
            annot_entry.set_data(annotations, sampleRate=1000, typeLength=1,
                                 contentClass='Annotation')

    # %% #### add rest ##########
    #############################
    for file in add_files:
        # ignore diagnosis files of StanfordStages
        if file.endswith(('diagnosis.txt', 'hypnodensity.txt', 'hypnogram.txt')):
            pass

        # %% add arousals
        elif file.endswith('_arousal.txt'):
            if 'arousals' in u and not overwrite: continue
            lines = misc.read_csv(file, convert_nums=True)
            sdate = u.starttime
            data = []
            for t_arousal, length, _ in lines[4:]:
                t_arousal = f'{sdate.year}.{sdate.month}.{sdate.day} ' + t_arousal[:8]
                t_arousal = datetime.strptime(t_arousal, '%Y.%m.%d %H:%M:%S')
                epoch = (t_arousal - sdate).seconds // 30
                data += [[epoch, length]]

            arousal_event = EventEntry('arousals.csv', parent=u)
            arousal_event.set_data(data,
                                   comment='Arousal appearance epoch, name is lengths in seconds',
                                   sampleRate=1 / 30,
                                   contentClass='Arousal',
                                   typeLength=1)

        # %% add hypnogram
        elif file.endswith('txt'):
            if 'hypnogram' in u and not overwrite: continue
            tqdm_desc(f'{code}: Reading Hypnogram')
            hypno = sleep_utils.read_hypnogram(file)
            u.epochs_hypno = len(hypno)
            times = np.arange(len(hypno))
            hypno = np.vstack([times, hypno]).T
            hypno_entry = EventEntry(id='hypnogram.csv', parent=u)
            hypno_entry.set_data(hypno,
                                 comment=f'File: {code}\nSleep stages 30s epochs.',
                                 sampleRate=1 / 30,
                                 contentClass='Stage',
                                 typeLength=1)

        elif file.endswith('.hypno'):
            if 'hypnogram_old' in u and not overwrite: continue
            hypno = sleep_utils.read_hypnogram(file)
            if not hasattr(u, 'epochs_hypno'):
                u.epochs_hypno = len(hypno)
            times = np.arange(len(hypno))
            hypno = np.vstack([times, hypno]).T
            hypno_old_entry = EventEntry(id='hypnogram_old.csv', parent=u)
            hypno_old_entry.set_data(hypno,
                                     comment=f'File: {code}\nSleep stages 30s epochs.',
                                     sampleRate=1 / 30,
                                     contentClass='Stage',
                                     typeLength=1)

        # %% add features and kubios
        elif file.endswith('mat'):
            if 'feats.pkl' in u and not overwrite: continue
            tqdm_desc(f'{code}: Reading Kubios')
            mat = loadmat(file)
            HRV = mat['Res']['HRV']
            feats_entry = CustomEntry('feats.pkl', parent=u)
            feats_entry.set_data(HRV,
                                 comment='pickle dump of the kubios created features file',
                                 fileType='pickle')

            wsize = cfg.default_wsize
            step = cfg.default_step
            offset = True
            u.compute_features()
            u.get_artefacts(wsize=wsize, step=step, offset=True)

            # %% add RRi
            tqdm_desc(f'{code}: writing RRi')
            rri_entry = CustomEntry('RRi.pkl', parent=u)
            rri_entry.set_data(HRV['Data']['RRi'],
                               comment='raw data of RRi, the interpolated RRs at 4hz',
                               fileType='pickle')
            rri_entry.sampleRate = 4

        # add artefact
        ############ removed artefact detection and calculated from kubios above
        # elif file.endswith('npy'):
        #     if 'artefacts' in u and not overwrite: continue
        #     tqdm_desc(f'{code}: Reading artefacts')
        #     art = np.load(file).ravel()
        #     u.epochs_art = len(art)//2
        #     u.artefact_percentage = np.mean(art)
        #     times = np.arange(len(art))
        #     art = np.vstack([times, art]).T
        #     artefact_entry = ValuesEntry(id='artefacts.csv', parent=u)
        #     artefact_entry.set_data(art, sampleRate=1/15, dataType='int16')

        elif file.endswith(('.edf', 'pkl')):
            pass

        else:
            raise Exception(f'unknown file type: {file}')

    u.save()
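# Hedged sketch (not from the original sources): the ECG/EEG/EOG blocks above derive
# lsbValue/baseline from the EDF digital and physical ranges via sleep_utils.minmax2lsb.
# One plausible implementation is the standard EDF linear scaling
#     physical = digital * lsb + offset;
# the real minmax2lsb may follow a different (e.g. unisens) convention.
def minmax2lsb_sketch(dmin, dmax, pmin, pmax):
    lsb = (pmax - pmin) / (dmax - dmin)   # physical units per digital step
    offset = pmin - dmin * lsb            # physical value corresponding to digital sample 0
    return lsb, offset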
import os
import shutil

import ospath
import config as cfg
from misc import read_csv
from tqdm import tqdm

if __name__ == '__main__':
    documents = cfg.documents
    datasets = [ospath.join(documents, 'mapping_' + d + '.csv') for d in cfg.datasets]
    matching = cfg.matching

    set1_path = ospath.join(cfg.folder_edf, 'set1')
    set2_path = ospath.join(cfg.folder_edf, 'set2')

    matchings = read_csv(matching)
    set1 = read_csv(datasets[0])
    set2 = read_csv(datasets[1])

    os.makedirs(ospath.join(cfg.folder_edf, 'set1'), exist_ok=True)
    os.makedirs(ospath.join(cfg.folder_edf, 'set2'), exist_ok=True)
    os.makedirs(ospath.join(cfg.folder_edf, 'set1', 'not_matched'), exist_ok=True)
    os.makedirs(ospath.join(cfg.folder_edf, 'set2', 'not_matched'), exist_ok=True)

    # copy the files into nt1:matched set1 and nt1:matched set2 respectively
    for p_orig, p_coded, gender, age, c_name, c_coded, c_gender, c_age, diff in tqdm(matchings):
        if int(diff) > cfg.max_age_diff:
            break
def anonymize_and_streamline(old_file, target_folder):
    """
    This function loads an EDF and
    1. removes its birthdate and patient name
    2. renames the channels to standardized channel names
    3. saves the file in another folder under a non-identifiable (coded) name
    4. verifies that the new file has the same content as the old one
    """
    # load the two csvs with the edfs that we don't process and where the ECG is upside down
    pre_coding_discard = [line[0] for line in misc.read_csv(cfg.edfs_discard) if line[2] == '1']
    to_invert = [line[0] for line in misc.read_csv(cfg.edfs_invert)]

    # here we read the list of controls and patients with their age and gender
    mappings = misc.read_csv(cfg.controls_csv)
    mappings.extend(misc.read_csv(cfg.patients_csv))
    mappings = {name: {'gender': gender, 'age': age}
                for name, gender, age, *_ in mappings}

    # old name is the personalized file without file extension, e.g. thomas_smith(1)
    old_name = ospath.splitext(ospath.basename(old_file))[0]
    # new name is the codified version without extension, e.g. '123_45678'
    new_name = codify(old_name)

    # use a temporary file to write and then move it;
    # this avoids half-written files that cannot be read later
    tmp_name = tempfile.TemporaryFile(prefix='anonymize').name

    if old_name in pre_coding_discard:
        print('EDF is marked as corrupt and will be discarded')
        return

    # this is where the anonymized file will be stored
    new_file = ospath.join(target_folder, new_name + '.edf')

    if ospath.exists(new_file):
        print('New file exists already {}'.format(new_file))

    else:
        # anonymize
        print('Writing {} from {}'.format(new_file, old_name))
        assert ospath.isfile(old_file), f'{old_file} does not exist'
        signals, signal_headers, header = sleep_utils.read_edf(old_file,
                                                               digital=True,
                                                               verbose=False)
        # remove patient info
        header['birthdate'] = ''
        header['patientname'] = new_name
        header['patientcode'] = new_name
        header['gender'] = mappings[old_name]['gender']
        header['age'] = mappings[old_name]['age']

        # rename channels to a unified notation, e.g. EKG becomes ECG I
        for shead in signal_headers:
            ch = shead['label']
            if ch in ch_mapping:
                ch = ch_mapping[ch]
                shead['label'] = ch

        # invert the ECG channel if necessary
        if old_name in to_invert:
            for i, sig in enumerate(signals):
                label = signal_headers[i]['label'].lower()
                if label == cfg.ecg_channel.lower():
                    signals[i] = -sig

        # we write to tmp to prevent that corrupted files are left behind
        print('Writing tmp for {}'.format(new_file))
        sleep_utils.write_edf(tmp_name, signals, signal_headers, header,
                              digital=True, correct=True)

        # verify that the contents of both files match exactly
        print('Verifying tmp for {}'.format(new_file))
        # embarrassing hack, as dmin/dmax don't match in this file after inverting
        if not old_name == 'B0036':
            sleep_utils.compare_edf(old_file, tmp_name, verbose=False)

        # now we move the tmp file to its new location
        shutil.move(tmp_name, new_file)

    # also copy additional file information, i.e. hypnograms and kubios files
    old_dir = ospath.dirname(old_file)
    # remove gender suffix from the 'weitere nt1' patients
    pattern = old_name.replace('_m', '').replace('_w', '')
    add_files = ospath.list_files(old_dir,
                                  patterns=[f'{pattern}*txt',
                                            f'{pattern}*dat',
                                            f'{pattern}*mat'])
    for add_file in add_files:
        # e.g. .mat or .npy etc
        new_add_file = ospath.join(target_folder,
                                   ospath.basename(add_file.replace(pattern, new_name)))
        if ospath.exists(new_add_file):
            continue
        # hypnograms will be copied to .hypno
        try:
            new_add_file = new_add_file.replace('-Schlafprofil', '')
            new_add_file = new_add_file.replace('_sl', '')
            new_add_file = new_add_file.replace('.txt', '.hypno').replace('.dat', '.hypno')
            shutil.copy(add_file, new_add_file)
        except Exception as e:
            print(e)

    return old_name, new_name
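# Hypothetical sketch (not part of the original sources): `codify` above maps a
# personalized filename to a pseudonymous code such as '123_45678'. One way such a
# helper could work is a stable hash of the name; the project's actual codify() may differ.
import hashlib

def codify_sketch(name):
    """Derive a stable pseudonym of the form 'XXX_XXXXX' from a personal name."""
    digest = hashlib.sha256(name.encode('utf-8')).hexdigest()
    part1 = int(digest[:6], 16) % 1000        # three leading digits
    part2 = int(digest[6:14], 16) % 100000    # five trailing digits
    return f'{part1:03d}_{part2:05d}'

# e.g. codify_sketch('thomas_smith(1)') always returns the same code for the same input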
'''20211128 averaging over a window, where the windowed data are from: raster_extract_spectra.py'''
import os
import sys
import csv
from misc import read_csv
from misc import exist
from misc import err

args = sys.argv
in_f = args[1]
if not exist(in_f):
    err('could not find input file: ' + in_f)

'''read the csv and locate the spectra'''
fields, data = read_csv(in_f)
fields = [x.strip().replace(',', '_') for x in fields]  # forbid comma in header
nf = len(fields)  # number of fields
f_i = {fields[i]: i for i in range(nf)}

'''insist on fields xoff and yoff'''
if (not 'xoff' in fields) or (not 'yoff' in fields):
    err("missing req'd fields: xoff, yoff")

spec_fi, nonspec_fi = [], []  # list col-idx for all spectral data columns
for i in range(nf):
    if fields[i][-2:] == 'nm':
        spec_fi += [i]
    else:
        # list non-spectral fields, except the offset-index fields coding the analysis-window position
        if fields[i] not in ['xoff', 'yoff', 'row', 'lin']:
            nonspec_fi += [i]
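# Hedged sketch (not the original continuation of this script): the averaging step this
# script builds toward could group rows by their (xoff, yoff) window position and average
# every spectral column within each group. Assumes read_csv returns column-major data
# (data[col][row]), as the sibling script csv_spectra_distance_simple.py suggests.
from collections import defaultdict

def window_average_sketch(data, spec_fi, xoff_i, yoff_i):
    n_rows = len(data[0])
    groups = defaultdict(list)  # (xoff, yoff) -> list of row indices
    for r in range(n_rows):
        groups[(data[xoff_i][r], data[yoff_i][r])].append(r)
    averaged = {}
    for key, rows in groups.items():
        averaged[key] = [sum(float(data[c][r]) for r in rows) / len(rows) for c in spec_fi]
    return averaged

# e.g.: avg = window_average_sketch(data, spec_fi, f_i['xoff'], f_i['yoff'])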
import sys
from multiprocessing import Lock

from misc import read_csv
from misc import err

args = sys.argv
lock = Lock()
n_processed = 0

if len(args) < 5:
    err('python3 csv_spectra_distance_simple.py [csv spectra file (one spectrum)] ' +
        ' [field to select from] [field value to select]' +
        ' [raster file]')

csv_fn, dfn = args[1], args[4]
select_field = args[2]
select_value = args[3]

'''read the csv and locate the spectra'''
fields, csv_data = read_csv(csv_fn)
nf = len(fields)  # number of fields
f_i = {fields[i]: i for i in range(nf)}

spec_fi = []
for i in range(nf):
    if fields[i][-2:] == 'nm':
        spec_fi += [i]
print('spectra col-ix', spec_fi)
print('number of cols', len(spec_fi))

select_i = f_i[select_field]  # index of the column to match on

'''average the spectra where field select_field matches the value select_value'''
N = len(csv_data[0])  # number of data points
n_select, spec_avg = 0., [0. for i in range(len(spec_fi))]  # averaged spectrum goes here
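# Hedged sketch (not from the original file): one way the accumulation described above
# could proceed, assuming the column-major layout implied by N = len(csv_data[0]) and
# numeric spectral values; the actual continuation may differ.
for j in range(N):
    if str(csv_data[select_i][j]).strip() == select_value:
        n_select += 1.
        for k, ci in enumerate(spec_fi):
            spec_avg[k] += float(csv_data[ci][j])

if n_select > 0:
    spec_avg = [x / n_select for x in spec_avg]  # mean spectrum over the selected rows
print('selected rows:', int(n_select))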
import csv
from misc import read_csv, generate_csv

backlink_list = read_csv('backlink.csv')

domain_list = []
with open('buyers_guide.csv', newline='') as csvfile:
    linereader = csv.reader(csvfile)
    for line in linereader:
        domain_list.append(line[0])

audit_domain_list = []
audit_backlink = []
for backlink in backlink_list:
    try:
        index = domain_list.index(backlink[3])
    except ValueError:
        index = None
    if index is not None:
        # {'backlink': backlink, 'domain': backlink[0]}
        audit_backlink.append(backlink)
        if backlink[3] not in audit_domain_list:
            audit_domain_list.append(domain_list[index])

with open('multiple_audit_backlinks_bg.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    for backlink in audit_backlink:
        writer.writerow(backlink)

csv_columns = ['URL', 'Audited']
with open('audit_domains_bg.csv', 'w', newline='') as csvfile: