def temporal(self, dct, now, args):
    needs_processing = {k: bool(v.get()) for k, v in dct['temporal_specific'].items()}

    out_dir = dct['out_dir'].get() + '/' + now + '/data/'
    util.make_dir(out_dir)

    # minimum support used by the pattern mining step
    min_sup = float(dct['temporal_specific']['support'].get())

    if not dct['temporal_specific']['sequences_available'].get():
        # if enrichment is enabled, we create a different object instance than usual
        if dct['enrich'].get():
            seq_p = SequenceEnrichProcess(*args, mapping_files_dir=dct['mapping_dir'].get())
            name = 'sequences_enriched'
        elif dct['temporal_specific']['anti-knowledge-driven'].get():
            seq_p = NonMarshallSequenceProcess(*args)
            name = 'sequences_excl_marshall'
        else:
            seq_p = SequenceProcess(*args)
            name = 'sequences'

        seq_p.process(needs_processing)
        seq_p.sort_sequences()
        seq_p.save_output(sequence_file=True, sub_dir='data/tmprl', name=name)

        generate_pattern_occurrences_per_patient(out_dir, seq_p.id2data, min_sup, dct['mapping_dir'].get())
        sequence_f = out_dir + '/tmprl/{}.csv'.format(name)
    else:
        sequence_f = dct['temporal_specific']['sequence_file'].get()
        generate_pattern_occurrences_per_patient(out_dir, sequence_f, min_sup, dct['mapping_dir'].get())
def export(self, out_dir):
    '''export results'''
    util.make_dir(out_dir)
    io.dict2csv(self.code2indications, out_dir + 'indication.csv')
    io.dict2csv(self.code2effects, out_dir + 'effect.csv')
    io.dict2csv(self.code2ingredients, out_dir + 'ingredient.csv')
def export(self, out_dir):
    '''export results'''
    util.make_dir(out_dir)
    io.dict2csv(self.code2manifestation_of, out_dir + 'manifestationof.csv')
    io.dict2csv(self.code2association, out_dir + 'association.csv')
def save_statistics(self, sub_dir='data', name='unnamed'):
    out_dir = self.out_dir + '/' + sub_dir + '/'
    util.make_dir(out_dir)

    f_out = out_dir + name + '.csv'
    with open(f_out, 'w') as f:
        for key, value in self.statistics.items():
            f.write('%s:%s\n' % (key, value))
def go(self, button):
    '''initiates the associated algorithms'''
    dct = self.user_input
    button.config(text='Running', state=DISABLED)

    # fill in defaults for fields the user left empty
    if dct['in_dir'].get() == 'input folder':
        dct['in_dir'].set('sql')
    if dct['delimiter'].get() == '':
        dct['delimiter'].set(',')
    if dct['out_dir'].get() == 'output folder':
        dct['out_dir'].set('./out')
    if dct['min_age'].get() == '':
        dct['min_age'].set(30)
    if dct['max_age'].get() == '':
        dct['max_age'].set(150)
    if dct['begin_interval'].get() == '':
        dct['begin_interval'].set(int(365. / 52 * 26 + 1))  # ~26 weeks, expressed in days
    if dct['end_interval'].get() == '':
        dct['end_interval'].set(int(365. / 52 * 0 + 1))
    if dct['ID_column'].get() == '':
        dct['ID_column'].set('patientnummer')
    if dct['temporal_specific']['support'].get() == '':
        dct['temporal_specific']['support'].set(0.1)
    if dct['mapping_dir'].get() == 'semantic enrichment dir':
        dct['mapping_dir'].set('./out/semantics/')

    self.master.update_idletasks()

    now = util.get_current_datetime()
    util.make_dir(dct['out_dir'].get() + '/' + now + '/')

    HISes = [dct['PMO'].get(), dct['MDM'].get(), dct['LUMC'].get(),
             dct['VUMH'].get(), dct['VUMD'].get(), dct['VUSC'].get()]

    args = [dct['in_dir'].get(),
            dct['delimiter'].get(),
            dct['out_dir'].get() + '/' + now,
            dct['ID_column'].get(),
            int(dct['min_age'].get()),
            int(dct['max_age'].get()),
            [int(dct['end_interval'].get()), int(dct['begin_interval'].get())],
            True if dct['in_dir'].get().lower() == 'sql' else False,
            HISes]

    if dct['process_temporal'].get():  # process temporally
        self.temporal(dct, now, args)
    else:  # process atemporally
        self.regular(dct, now, args)

    pretty_dct = util.tkinter2var(dct)
    try:
        io.pprint_to_file(dct['out_dir'].get() + '/' + now + '/settings.txt', pretty_dct)
    except IOError as e:
        print(e)
def save_output(self, benchmark=False, sequence_file=False, sub_dir='', name='unnamed', target=False):
    '''saves processed data to the specified output directory'''
    print('...saving processed data')

    headers = self.headers

    # note: output is currently always written to csv, even when the data came from sql
    if True or not self.from_sql:
        # possibly make new directories
        out_dir = self.out_dir + '/' + sub_dir + '/'
        util.make_dir(out_dir)

        f_out = out_dir + name + '.csv'
        out = write_csv(f_out)

        # write headers where required
        if benchmark:
            out.writerow(headers[0:3])
        elif target:
            out.writerow([headers[0], headers[-1]])
        elif sequence_file:
            pass
        else:
            out.writerow([headers[0]] + headers[3:-1])

        # write data
        for value in self.id2data.values():
            data = value['data']
            if benchmark:
                data = data[0:3]
                data[2] = 1 if data[2] == 'V' else 0
            elif target:
                if self.survival == False:
                    data = [data[0], 0 if data[-1] == 'negative' else 1]
                else:
                    data = [data[0], data[-1]]
            elif sequence_file:
                pass
            else:
                data = [data[0]] + data[3:-1]
            out.writerow(data)
def temporal(self, dct, now, args):
    needs_processing = {k: bool(v.get()) for k, v in dct['temporal_specific'].items()}

    out_dir = dct['out_dir'].get() + '/' + now + '/data/'
    util.make_dir(out_dir)

    # minimal support is set here
    min_sup = float(dct['temporal_specific']['support'].get())

    # if there are no sequences available
    if not dct['temporal_specific']['sequences_available'].get():
        # if enrichment is enabled, we create a different object instance than usual
        # (enrichment is currently disabled here)
        # if dct['enrich'].get():
        #     seq_p = SequenceEnrichProcess(*args, mapping_files_dir=dct['mapping_dir'].get())
        #     name = 'sequences_enriched'

        # if not enriched and no marshall predictors
        if dct['temporal_specific']['anti-knowledge-driven'].get():
            seq_p = NonMarshallSequenceProcess(*args)
            name = 'sequences_excl_marshall'
        else:
            seq_p = SequenceProcess(*args)
            name = 'sequences'

        seq_p.process(needs_processing)
        seq_p.sort_sequences()
        seq_p.save_output(sequence_file=True, sub_dir='data/tmprl', name=name)

        generate_pattern_occurrences_per_patient(out_dir, seq_p.id2data, min_sup, dct['mapping_dir'].get())
        sequence_f = out_dir + '/tmprl/{}.csv'.format(name)
    else:
        sequence_f = dct['temporal_specific']['sequence_file'].get()
        generate_pattern_occurrences_per_patient(out_dir, sequence_f, min_sup, dct['mapping_dir'].get())
def go(self, button):
    '''initiates the associated algorithms'''
    dct = self.user_input
    button.config(text='Running', state=DISABLED)

    # fill in defaults for fields the user left empty
    if dct['in_dir'].get() == 'input folder':
        dct['in_dir'].set('/Users/Reiny/Documents/UI_CRC/playground')
    if dct['delimiter'].get() == '':
        dct['delimiter'].set(',')
    if dct['out_dir'].get() == 'output folder':
        dct['out_dir'].set('/Users/Reiny/Documents/UI_CRC/out')

    self.master.update_idletasks()
    util.make_dir(dct['out_dir'].get() + '/')

    HISes = [dct['PMO'].get(), dct['MDM'].get(), dct['LUMC'].get(),
             dct['VUMH'].get(), dct['VUMD'].get(), dct['VUSC'].get()]

    args = [dct['in_dir'].get(),
            dct['delimiter'].get(),
            dct['out_dir'].get() + '/' + dct['output_id'].get() + '.csv',
            dct['age+gender'].get(),
            dct['counts_med'].get(),
            dct['counts_med_enrich'].get(),
            dct['counts_consult'].get(),
            dct['counts_consult_enrich'].get(),
            dct['counts_referral'].get(),
            dct['counts_lab'].get(),
            dct['tmprl'].get(),
            dct['enriched_tmprl'].get(),
            dct['knowledge_driven'].get(),
            dct['anti_knowledge_driven'].get(),
            dct['anti_knowledge_driven_tmprl'].get(),
            dct['separate'].get(),
            HISes]

    # merge
    combine.execute(*args)

    button.config(text='Done')
    self.master.update_idletasks()
    time.sleep(0.5)
    button.config(text='Run!', state=NORMAL)
def go(self, button):
    '''initiates the associated algorithms'''
    dct = self.user_input
    button.config(text='Running', state=DISABLED)
    self.master.update_idletasks()

    util.make_dir(dct['f_out'].get())

    report = Report(dct['f_general'].get(),
                    dct['f_data'].get(),
                    dct['f_predictors'].get(),
                    dct['f_out'].get(),
                    float(dct['feature-threshold'].get()))
    report.compile()
    report.export()

    print('### Done processing ###')

    button.config(text='Done')
    self.master.update_idletasks()
    time.sleep(0.5)
    button.config(text='Run!', state=NORMAL)
def go(self, button):
    '''initiates the associated algorithms'''
    dct = self.user_input
    button.config(text='Running', state=DISABLED)
    self.master.update_idletasks()

    util.make_dir(dct['f_out'].get())

    report = Report(dct['f_general'].get(),
                    dct['f_data'].get(),
                    dct['f_predictors'].get(),
                    dct['f_out'].get(),
                    float(dct['feature-threshold'].get()))
    report.compile()
    report.export()

    print('### Done processing ###')

    button.config(text='Done')
    self.master.update_idletasks()
    time.sleep(0.5)
    button.config(text='Run!', state=NORMAL)
def go(self, button):
    '''initiates the associated algorithms'''
    dct = self.user_input
    button.config(text='Running', state=DISABLED)

    # fill in defaults for fields the user left empty
    if dct['in_dir'].get() == 'input folder':
        dct['in_dir'].set('sql')
    if dct['delimiter'].get() == '':
        dct['delimiter'].set(',')
    if dct['out_dir'].get() == 'output folder':
        dct['out_dir'].set('./out')
    if dct['min_age'].get() == '':
        dct['min_age'].set(30)
    if dct['max_age'].get() == '':
        dct['max_age'].set(150)
    if dct['begin_interval'].get() == '':
        dct['begin_interval'].set(int(365. / 52 * 26 + 1))  # ~26 weeks, expressed in days
    if dct['end_interval'].get() == '':
        dct['end_interval'].set(int(365. / 52 * 0 + 1))
    if dct['ID_column'].get() == '':
        dct['ID_column'].set('patientnummer')
    if dct['temporal_specific']['support'].get() == '':
        dct['temporal_specific']['support'].set(0.1)
    # if dct['mapping_dir'].get() == 'semantic enrichment dir':
    #     dct['mapping_dir'].set('./out/semantics/')

    self.master.update_idletasks()

    now = util.get_current_datetime()
    util.make_dir(dct['out_dir'].get() + '/' + now + '/')

    # HISes = [dct['PMO'].get(), dct['MDM'].get(), dct['LUMC'].get(),
    #          dct['VUMH'].get(), dct['VUMD'].get(), dct['VUSC'].get()]

    args = [dct['in_dir'].get(),
            dct['delimiter'].get(),
            dct['out_dir'].get() + '/' + now,
            dct['ID_column'].get(),
            int(dct['min_age'].get()),
            int(dct['max_age'].get()),
            [int(dct['end_interval'].get()), int(dct['begin_interval'].get())],
            True if dct['in_dir'].get().lower() == 'sql' else False,
            False,
            dct['survival'].get(),
            dct['already_processed'].get()]

    if dct['process_temporal'].get():  # process temporally
        self.temporal(dct, now, args)
    else:  # process atemporally
        self.regular(dct, now, args)

    pretty_dct = util.tkinter2var(dct)
    try:
        io.pprint_to_file(dct['out_dir'].get() + '/' + now + '/settings.txt', pretty_dct)
    except IOError as e:
        print(e)

    print('### Done processing ###')

    button.config(text='Done')
    self.master.update_idletasks()
    time.sleep(0.5)
    button.config(text='Run!', state=NORMAL)
def execute(in_dir, out_dir, record_id, target_id, day_id, day, algorithms,
            feature_selection, separate_testset, in_dir_test):
    '''executes the learning task on the data in in_dir with the algorithms in algorithms.
       The results are written to out_dir and subdirectories, and the record_ and target_ids
       are used to differentiate attributes and non-attributes'''
    print('### executing learning algorithms on... ###')

    # get the files
    files = util.list_dir_csv(in_dir)

    # stop if no files found
    if not files:
        print('No appropriate csv files found. Select an input directory with appropriate files')
        return

    if separate_testset:
        files_test = util.list_dir_csv(in_dir_test)
    else:
        files_test = files

    # create directory
    util.make_dir(out_dir)

    # execute each algorithm
    for alg in algorithms:
        print('...{}'.format(alg))

        util.make_dir(out_dir + '/' + alg + '/')
        results_list = []  # list which will contain the results
        if separate_testset:
            results_list2 = []
            util.make_dir(out_dir + '/' + alg + '_test/')

        # run algorithm alg for each file f
        for f, f_test in zip(files, files_test):
            fname = in_out.get_file_name(f, extension=False)
            print(' ...{}'.format(fname))

            # get data, split in features/target. If invalid stuff happened --> exit
            # assumption: first column is patientnumber and is pruned, last is target
            X, y, headers = in_out.import_data(f, record_id, target_id)
            if type(X) == bool:
                return

            # Now remove the ones without a relevant day:
            new_headers = [h for h in headers if not h == day_id]
            day_index = headers.index(day_id)
            new_X = np.zeros((0, len(headers)))
            new_y = []
            for i in range(0, X.shape[0]):
                if X[i, headers.index(day_id)] == day:
                    row = np.array(X[i, :]).reshape(-1)
                    new_X = np.append(new_X, np.column_stack(row), axis=0)
                    new_y.append(int(y[i]))
            new_X = np.delete(new_X, day_index, 1)
            X = new_X
            y = np.squeeze(np.asarray(new_y))

            print(' ...instances: {}, attributes: {}'.format(X.shape[0], X.shape[1]))

            model, best_features, results = execute_with_algorithm(
                alg, X, y, fname, headers, out_dir + '/' + alg + '/',
                record_id, target_id, feature_selection)
            results_list.append(results)

            if separate_testset:
                # assumption: first column is patientnumber and is pruned, last is target
                X, y, headers = in_out.import_data(f_test, record_id, target_id)
                if type(X) == bool:
                    return

                print(' ...instances: {}, attributes: {} (test set)'.format(X.shape[0], X.shape[1]))

                results = predict_separate(X, y, fname, out_dir + '/' + alg + '_test/',
                                           record_id, target_id, feature_selection,
                                           model, best_features)
                results_list2.append(results)

        try:
            in_out.save_ROC(out_dir + '/' + alg + '/' + "roc.png", results_list, title='ROC curve')
        except IndexError:
            pass

        try:
            in_out.save_ROC(out_dir + '/' + alg + '_test/' + "roc.png", results_list2, title='ROC curve')
        except NameError:
            pass

    # notify user
    print('## Learning Finished ##')
def execute(in_dir, out_dir, record_id, target_id, algorithms, feature_selection,
            separate_testset, in_dir_test, survival, oversampling, undersampling, aggregation):
    '''executes the learning task on the data in in_dir with the algorithms in algorithms.
       The results are written to out_dir and subdirectories, and the record_ and target_ids
       are used to differentiate attributes and non-attributes'''
    print('### executing learning algorithms on... ###')

    # get the files
    files = util.list_dir_csv(in_dir)

    # stop if no files found
    if not files:
        print('No appropriate csv files found. Select an input directory with appropriate files')
        return

    if separate_testset:
        files_test = util.list_dir_csv(in_dir_test)
    else:
        files_test = files

    # create directory
    util.make_dir(out_dir)

    # execute each algorithm
    for alg in algorithms:
        print('...{}'.format(alg))

        util.make_dir(out_dir + '/' + alg + '/')
        results_list = []  # list which will contain the results
        if separate_testset:
            results_list2 = []
            util.make_dir(out_dir + '/' + alg + '_test/')

        # run algorithm alg for each file f
        for f, f_test in zip(files, files_test):
            fname = in_out.get_file_name(f, extension=False)
            print(' ...{}'.format(fname))

            # get data, split in features/target. If invalid stuff happened --> exit
            # assumption: first column is patientnumber and is pruned, last is target
            X, y, headers, target_list = in_out.import_data(f, record_id, target_id, survival)
            if type(X) == bool:
                return

            if aggregation == True:
                X, headers = aggregations(f, target_list, survival)

            print(' ...instances: {}, attributes: {}'.format(X.shape[0], X.shape[1]))

            model, best_features, results = execute_with_algorithm(
                alg, X, y, fname, headers, out_dir + '/' + alg + '/',
                record_id, target_id, feature_selection, oversampling,
                survival, undersampling, aggregation)
            results_list.append(results)

            if separate_testset:
                # assumption: first column is patientnumber and is pruned, last is target
                X, y, headers = in_out.import_data(f_test, record_id, target_id)
                if type(X) == bool:
                    return

                print(' ...instances: {}, attributes: {} (test set)'.format(X.shape[0], X.shape[1]))

                results = predict_separate(X, y, fname, out_dir + '/' + alg + '_test/',
                                           record_id, target_id, feature_selection,
                                           model, best_features)
                results_list2.append(results)

        try:
            in_out.save_ROC(out_dir + '/' + alg + '/' + "roc.png", results_list, title='ROC curve')
        except IndexError:
            pass

        try:
            in_out.save_ROC(out_dir + '/' + alg + '_test/' + "roc.png", results_list2, title='ROC curve')
        except NameError:
            pass

    # notify user
    print('## Learning Finished ##')
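# The `oversampling` and `undersampling` flags above are passed straight through to
# execute_with_algorithm, whose internals are not shown in this file. Purely as a point of
# reference, a minimal sketch of what random over-sampling of the minority class typically
# looks like is given below; `random_oversample` is an illustrative name and is NOT part of
# this codebase.
import numpy as np

def random_oversample(X, y, seed=0):
    """Duplicate minority-class rows at random until all classes are equally frequent.

    Illustrative only: this project delegates resampling to execute_with_algorithm.
    """
    rng = np.random.RandomState(seed)
    classes, counts = np.unique(y, return_counts=True)
    majority = counts.max()
    X_parts, y_parts = [X], [y]
    for cls, count in zip(classes, counts):
        deficit = majority - count
        if deficit > 0:
            idx = np.where(y == cls)[0]
            extra = rng.choice(idx, size=deficit, replace=True)  # sample with replacement
            X_parts.append(X[extra])
            y_parts.append(y[extra])
    return np.vstack(X_parts), np.concatenate(y_parts)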
def execute_knn(in_dir, out_dir, record_id, target_id, day_id, day, k):
    '''executes the learning task on the data in in_dir with the algorithms in algorithms.
       The results are written to out_dir and sub_directories, and the record_ and target_ids
       are used to differentiate attributes and non-attributes'''
    print('### executing learning algorithms on... ###')

    # get the files
    files = util.list_dir_csv(in_dir)

    # stop if no files found
    if not files:
        print('No appropriate csv files found. Select an input directory with appropriate files')
        return

    # create directory
    util.make_dir(out_dir)

    # run the k-NN procedure for each file f
    for f in files:
        results_list = []

        fname = in_out.get_file_name(f, extension=False)
        print(' ...{}'.format(fname))

        # get data, split in features/target. If invalid stuff happened --> exit
        # assumption: first column is patientnumber
        X, y, headers = in_out.import_data(f, record_id, target_id, True)
        if type(X) == bool:
            return

        day_index = headers.index(day_id)
        new_X = np.zeros((0, len(headers)))
        new_y = []
        IDs = []
        IDrows = {}

        # ordering of time points and complete data (filled with nan's if not available) assumed!
        # features_to_be_removed = ["pvc_bin", "pnc_bin", "pac_bin", "ect_freq_bin", "full_code_bin",
        #     "comfort_meas_bin", "other_code_bin", "no_cpr_bin", "dnr_bin", "dni_bin", "fall_risk_bin",
        #     "orientation_ord", "orient_unable_ass_bin", "riker_sas_ord", "vent_bin", "vent_mode_ord",
        #     "pacemaker_bin", "trach_bin", "flush_skin_bin", "jaundice_skin_bin", "pale_skin_bin",
        #     "impaired_skin_bin", "iabp_ord", "iabp_bin", "svnsicu_bin", "svcsicu_bin", "svcsru_bin",
        #     "svmicu_bin", "svmsicu_bin", "svother_bin", "svccu_bin", "gender"]
        exclude = [146, 140, 95, 123, 88, 133, 22, 65, 49, 114, 178, 55, 133, 138, 34, 186, 20, 73]

        # select the rows for the requested day (or all days if day == -1),
        # registering the row range per patient ID and skipping excluded IDs
        new_index = 0
        for i in range(0, X.shape[0]):
            if X[i, headers.index(day_id)] == day or day == -1:
                row = np.array(X[i, :]).reshape(-1)
                if not row[0] in IDs and not row[0] in exclude:
                    IDs.append(row[0])
                    new_y.append(int(y[i]))
                    IDrows[row[0]] = [new_index]
                elif not row[0] in exclude:
                    IDrows[row[0]].append(new_index)
                new_X = np.append(new_X, np.column_stack(row), axis=0)
                new_index += 1

        ID_column = new_X[:, 0]

        # Remove the id, the day, and the time stamp from the data and headers.
        new_X = np.delete(new_X, 2, 1)
        new_X = np.delete(new_X, 1, 1)
        new_X = np.delete(new_X, 0, 1)
        new_headers = headers[3:len(headers)]

        dtw_attr = ['hr', 'resp', 'nbp', 'sbp', 'dbp', 'so2']
        X = new_X
        print(len(X))

        # drop columns that contain only nan's or a single distinct value
        non_singular_rows = [i for i in range(0, X.shape[1])
                             if len(set(util.get_non_nans(X[:, i].tolist()))) > 1]
        X = X[:, non_singular_rows]
        new_headers = np.array(new_headers)[non_singular_rows].tolist()
        print(str(len(new_headers)) + ' headers left after removing non-varying columns')
        print(new_headers)
        print('Removed columns with only nan or a single value')

        max_values = np.nanmax(X, axis=0)
        min_values = np.nanmin(X, axis=0)
        ranges = []
        for i in range(0, len(min_values)):
            diff = max_values[i] - min_values[i]
            if diff == 0:
                print('difference of zero encountered in ' + str(i))
                print('Max values: ' + str(max_values[i]))
                print('Min values: ' + str(min_values[i]))
                ranges.append(1)
            else:
                ranges.append(diff)

        # Now do some scaling to get the values to the same order of magnitude
        scaled_X = (X - min_values) / (max_values - min_values)
        X = scaled_X
        y = np.squeeze(np.asarray(new_y))
        print('Scaling done!')

        # collapse the per-patient row indices to a first/last row range
        new_IDrows = {}
        for ID in IDs:
            IDrows[ID] = {'first_row': min(IDrows[ID]), 'last_row': max(IDrows[ID])}

        print(' ...instances: {}, attributes: {}'.format(X.shape[0], X.shape[1]))

        # Now we are going to build the similarity matrix. We also store for how many
        # attributes we were actually able to make a comparison.
        # note: dtw_attr is defined above; `window` is assumed to be a module-level constant
        similarity_matrix = np.ones((len(IDs), len(IDs)))
        matching_number_matrix = np.ones((len(IDs), len(IDs)))
        for attr in range(0, len(new_headers)):
            print(str(attr) + ' attribute in KNN loop')
            print(str(attr) + '/' + str(len(new_headers)))
            temp = np.ones((len(IDs), len(IDs)))
            temp[:] = 2
            for i in range(0, len(IDs)):
                for j in range(i + 1, len(IDs)):
                    i_data = X[IDrows[IDs[i]]['first_row']:IDrows[IDs[i]]['last_row'] + 1, attr].tolist()
                    j_data = X[IDrows[IDs[j]]['first_row']:IDrows[IDs[j]]['last_row'] + 1, attr].tolist()
                    if new_headers[attr] in dtw_attr:
                        # time-series attribute: use a DTW-based distance
                        dtw_distance = dtw.lb_keogh(i_data, j_data, window)
                        if not dtw_distance == -1:
                            temp[i, j] += dtw_distance
                            matching_number_matrix[i, j] += 1
                            matching_number_matrix[j, i] = matching_number_matrix[i, j]
                            temp[j, i] = temp[i, j]
                    else:
                        # static attribute: squared difference of the (non-nan) means
                        i_data = util.get_non_nans(i_data)
                        j_data = util.get_non_nans(j_data)
                        if len(i_data) > 0 and len(j_data) > 0:
                            simple_distance = math.pow(np.mean(i_data) - np.mean(j_data), 2)
                            temp[i, j] += simple_distance
                            matching_number_matrix[i, j] += 1
                            matching_number_matrix[j, i] = matching_number_matrix[i, j]
                            temp[j, i] = temp[i, j]
            if np.max(temp) != 0:
                temp = temp / np.max(temp)
            similarity_matrix += temp

        # We calculate the average score per item matched.
        # Best might be to apply a weighting scheme now.
        similarity_matrix = (similarity_matrix / matching_number_matrix) + (1 / matching_number_matrix)
        print(len(IDs))

        results = perform_classification(similarity_matrix, y, out_dir, k)
        results_list.append(results)
        print(results)

        in_out.save_results(out_dir + str(k) + '.csv', ["fpr", "tpr", "auc", "cm"],
                            results[1:len(results)], [sum(y), len(y)])

        in_out.save_ROC(out_dir + '/roc.png', results_list, title='ROC curve')

    # notify user
    print('## Learning Finished ##')
    print(similarity_matrix)
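# The similarity_matrix built above is handed to perform_classification, which is defined
# elsewhere in the project. Purely as an illustration, and under the assumption that the
# matrix holds pairwise distances (smaller = more similar), a leave-one-out k-NN vote over
# such a matrix could look like the hypothetical helper below; none of these names exist
# in this codebase.
import numpy as np

def knn_from_distance_matrix(dist, y, k):
    """Leave-one-out k-NN majority vote over a precomputed distance matrix."""
    y = np.asarray(y)
    preds = []
    for i in range(dist.shape[0]):
        d = dist[i].copy()
        d[i] = np.inf                   # never pick the instance itself as a neighbour
        neighbours = np.argsort(d)[:k]  # indices of the k smallest distances
        votes = y[neighbours]
        preds.append(np.bincount(votes).argmax())
    return np.array(preds)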
def execute(in_dir, delim, out_file, age_gender=False, counts_med=False, counts_med_enrich=False,
            counts_consult=False, counts_consult_enrich=False, counts_referral=False, counts_lab=False,
            tmprl=False, enriched_tmprl=False, knowledge_driven=False, counts_no_knowledge=False,
            tmprl_no_knowledge=False, separate=False, HISes=[]):
    '''merge the in files to produce the out file'''
    merged = defaultdict(list)
    headers = ['ID']

    # we may not need this.
    ID2HIS = {}
    merged_test = defaultdict(list)

    # if we wish to separate, get dictionary of patient HIS sources using SQL.
    if separate:
        c = util.sql_connect().cursor()
        HISes_str = "','".join(HISes)
        q = '''SELECT patientnummer FROM patienten WHERE PRAKTIJKCODE IN ('{}')'''.format(HISes_str)
        c.execute(q)
        ID2HIS = {row[0]: row[0] for row in c}

    if age_gender:
        headers = merge_file(in_dir + '/AG.csv', merged, headers, delim, separate, ID2HIS, merged_test)
    if counts_med:
        headers = merge_file(in_dir + '/C_M.csv', merged, headers, delim, separate, ID2HIS, merged_test)
    if counts_med_enrich:
        headers = merge_file(in_dir + '/C_M_enrich.csv', merged, headers, delim, separate, ID2HIS, merged_test)
    if counts_consult:
        headers = merge_file(in_dir + '/C_C.csv', merged, headers, delim, separate, ID2HIS, merged_test)
    if counts_consult_enrich:
        headers = merge_file(in_dir + '/C_C_enrich.csv', merged, headers, delim, separate, ID2HIS, merged_test)
    if counts_referral:
        headers = merge_file(in_dir + '/C_R.csv', merged, headers, delim, separate, ID2HIS, merged_test)
    if counts_lab:
        headers = merge_file(in_dir + '/C_L.csv', merged, headers, delim, separate, ID2HIS, merged_test)
    if tmprl:
        headers = merge_file(in_dir + '/T.csv', merged, headers, delim, separate, ID2HIS, merged_test)
    if enriched_tmprl:
        headers = merge_file(in_dir + '/T_enrich.csv', merged, headers, delim, separate, ID2HIS, merged_test)
    if knowledge_driven:
        headers = merge_file(in_dir + '/K.csv', merged, headers, delim, separate, ID2HIS, merged_test)
    if counts_no_knowledge:
        headers = merge_file(in_dir + '/C_NK.csv', merged, headers, delim, separate, ID2HIS, merged_test)
    if tmprl_no_knowledge:
        headers = merge_file(in_dir + '/T_NK.csv', merged, headers, delim, separate, ID2HIS, merged_test)

    headers = merge_file(in_dir + '/CRC.csv', merged, headers, delim, separate, ID2HIS, merged_test)

    # now write to new file (also check whether all results have same length)
    make_dir(out_file)
    out = io.write_csv(out_file)
    out.writerow(headers)

    skip = 0
    for key in merged:
        if len(headers) != 1 + len(merged[key]):
            print('unequal to header amount ({} vs {})! watch out.'.format(len(headers), len(merged[key])))
            # skip += 1
            # continue
        out.writerow([key] + merged[key])

    if separate:
        out_file_test = out_file[:out_file.rfind('/') + 1] + 'test' + out_file[out_file.rfind('/'):]
        make_dir(out_file_test)
        out = io.write_csv(out_file_test)
        out.writerow(headers)
        for key in merged_test:
            if len(headers) != 1 + len(merged_test[key]):
                print('unequal to header amount ({} vs {})! watch out.'.format(len(headers), len(merged_test[key])))
                # skip += 1
                # continue
            out.writerow([key] + merged_test[key])

    print('## Done Merging ##')
def execute(in_dir, delim, out_file, age_gender=False, counts_med=False, counts_med_enrich=False,
            counts_consult=False, counts_consult_enrich=False, counts_referral=False, counts_lab=False,
            all_counts=False, tmprl=False, enriched_tmprl=False, knowledge_driven=False,
            counts_no_knowledge=False, tmprl_no_knowledge=False, separate=False, HISes=[]):
    '''merge the in files to produce the out file'''
    merged = defaultdict(list)
    headers = ['ID']

    # we may not need this.
    ID2HIS = {}
    merged_test = defaultdict(list)

    # if we wish to separate, get dictionary of patient HIS sources using SQL.
    if separate:
        c = util.sql_connect().cursor()
        HISes_str = "','".join(HISes)
        q = '''SELECT patientnummer FROM patienten WHERE PRAKTIJKCODE IN ('{}')'''.format(HISes_str)
        c.execute(q)
        ID2HIS = {row[0]: row[0] for row in c}

    if age_gender:
        headers = merge_file(in_dir + '/AG.csv', merged, headers, delim, separate, ID2HIS, merged_test)
    if counts_med:
        headers = merge_file(in_dir + '/C_M.csv', merged, headers, delim, separate, ID2HIS, merged_test)
    if counts_med_enrich:
        headers = merge_file(in_dir + '/C_M_enrich.csv', merged, headers, delim, separate, ID2HIS, merged_test)
    if counts_consult:
        headers = merge_file(in_dir + '/C_C.csv', merged, headers, delim, separate, ID2HIS, merged_test)
    if counts_consult_enrich:
        headers = merge_file(in_dir + '/C_C_enrich.csv', merged, headers, delim, separate, ID2HIS, merged_test)
    if counts_referral:
        headers = merge_file(in_dir + '/C_R.csv', merged, headers, delim, separate, ID2HIS, merged_test)
    if counts_lab:
        headers = merge_file(in_dir + '/C_L.csv', merged, headers, delim, separate, ID2HIS, merged_test)
    if tmprl:
        headers = merge_file(in_dir + '/T.csv', merged, headers, delim, separate, ID2HIS, merged_test)
    if enriched_tmprl:
        headers = merge_file(in_dir + '/T_enrich.csv', merged, headers, delim, separate, ID2HIS, merged_test)
    if knowledge_driven:
        headers = merge_file(in_dir + '/K.csv', merged, headers, delim, separate, ID2HIS, merged_test)
    if counts_no_knowledge:
        headers = merge_file(in_dir + '/C_NK.csv', merged, headers, delim, separate, ID2HIS, merged_test)
    if tmprl_no_knowledge:
        headers = merge_file(in_dir + '/T_NK.csv', merged, headers, delim, separate, ID2HIS, merged_test)
    if all_counts:
        print('ja')
        headers = merge_file(in_dir + '/counts.csv', merged, headers, delim, separate, ID2HIS, merged_test)

    headers = merge_file(in_dir + '/stroke.csv', merged, headers, delim, separate, ID2HIS, merged_test)

    # now write to new file (also check whether all results have same length)
    make_dir(out_file)
    out = io.write_csv(out_file)
    out.writerow(headers)

    skip = 0
    for key in merged:
        if len(headers) != 1 + len(merged[key]):
            print('unequal to header amount ({} vs {})! watch out.'.format(len(headers), len(merged[key])))
            # skip += 1
            # continue
        out.writerow([key] + merged[key])

    if separate:
        out_file_test = out_file[:out_file.rfind('/') + 1] + 'test' + out_file[out_file.rfind('/'):]
        make_dir(out_file_test)
        out = io.write_csv(out_file_test)
        out.writerow(headers)
        for key in merged_test:
            if len(headers) != 1 + len(merged_test[key]):
                print('unequal to header amount ({} vs {})! watch out.'.format(len(headers), len(merged_test[key])))
                # skip += 1
                # continue
            out.writerow([key] + merged_test[key])

    print('## Done Merging ##')
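# merge_file above is defined elsewhere in the project. As a generic illustration of the idea
# (append each file's feature columns to a per-patient row keyed on the ID in the first column),
# a stripped-down version might look like the hypothetical helper below; it ignores the
# separate/ID2HIS/merged_test arguments and is NOT the project's implementation.
import csv
from collections import defaultdict

def merge_file_sketch(f_name, merged, headers, delim):
    """Append the columns of one csv file to the merged dict, keyed on the first column (ID)."""
    with open(f_name, 'r') as f:
        rows = csv.reader(f, delimiter=delim)
        file_headers = next(rows)
        headers = headers + file_headers[1:]     # keep the feature column names, drop the ID header
        for row in rows:
            merged[row[0]].extend(row[1:])       # one growing feature list per patient ID
    return headers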
def execute_knn(in_dir, out_dir, record_id, target_id, day_id, day, k):
    '''executes the learning task on the data in in_dir with the algorithms in algorithms.
       The results are written to out_dir and subdirectories, and the record_ and target_ids
       are used to differentiate attributes and non-attributes'''
    print('### executing learning algorithms on... ###')

    # get the files
    files = util.list_dir_csv(in_dir)

    # stop if no files found
    if not files:
        print('No appropriate csv files found. Select an input directory with appropriate files')
        return

    # create directory
    util.make_dir(out_dir)

    # run the k-NN procedure for each file f
    for f in files:
        results_list = []

        fname = in_out.get_file_name(f, extension=False)
        print(' ...{}'.format(fname))

        # get data, split in features/target. If invalid stuff happened --> exit
        # assumption: first column is patientnumber
        X, y, headers = in_out.import_data(f, record_id, target_id, True)
        if type(X) == bool:
            return

        day_index = headers.index(day_id)
        new_X = np.zeros((0, len(headers)))
        new_y = []
        IDs = []
        IDrows = {}

        # ordering of time points and complete data (filled with nan's if not available) assumed!
        # Select the right day and normalize the columns
        new_index = 0
        for i in range(0, X.shape[0]):
            if X[i, headers.index(day_id)] == day or day == -1:
                row = np.array(X[i, :]).reshape(-1)
                if not row[0] in IDs:
                    IDs.append(row[0])
                    new_y.append(int(y[i]))
                    IDrows[row[0]] = [new_index]
                else:
                    IDrows[row[0]].append(new_index)
                new_X = np.append(new_X, np.column_stack(row), axis=0)
                new_index += 1

        # Remove the id, the day, and the time stamp from the data and headers.
        new_X = np.delete(new_X, 2, 1)
        new_X = np.delete(new_X, 1, 1)
        new_X = np.delete(new_X, 0, 1)
        new_headers = headers[3:len(headers)]
        X = new_X

        # Remove columns with only a single value or all nans
        non_singular_rows = [i for i in range(0, X.shape[1])
                             if len(set(util.get_non_nans(X[:, i].tolist()))) > 1]
        X = X[:, non_singular_rows]
        new_headers = np.array(new_headers)[non_singular_rows].tolist()

        max_values = np.nanmax(X, axis=0)
        min_values = np.nanmin(X, axis=0)
        ranges = []
        for i in range(0, len(min_values)):
            diff = max_values[i] - min_values[i]
            if diff == 0:
                print('difference of zero encountered in ' + str(i))
                print('Max values: ' + str(max_values[i]))
                print('Min values: ' + str(min_values[i]))
                ranges.append(1)
            else:
                ranges.append(diff)

        # Now do some scaling to get the values to the same order of magnitude
        scaled_X = (X - min_values) / (max_values - min_values)
        X = scaled_X
        y = np.squeeze(np.asarray(new_y))

        # collapse the per-patient row indices to a first/last row range
        new_IDrows = {}
        for ID in IDs:
            IDrows[ID] = {'first_row': min(IDrows[ID]), 'last_row': max(IDrows[ID])}

        print(' ...instances: {}, attributes: {}'.format(X.shape[0], X.shape[1]))

        # Now we are going to build the similarity matrix. We also store for how many
        # attributes we were actually able to make a comparison.
        # note: dtw_attr and window are assumed to be defined at module level
        similarity_matrix = np.zeros((len(IDs), len(IDs)))
        matching_number_matrix = np.ones((len(IDs), len(IDs)))
        for i in range(0, len(IDs)):
            for j in range(i + 1, len(IDs)):
                for attr in range(0, len(new_headers)):
                    i_data = X[IDrows[IDs[i]]['first_row']:IDrows[IDs[i]]['last_row'] + 1, attr].tolist()
                    j_data = X[IDrows[IDs[j]]['first_row']:IDrows[IDs[j]]['last_row'] + 1, attr].tolist()
                    if new_headers[attr] in dtw_attr:
                        # time-series attribute: use a DTW-based distance
                        dtw_distance = dtw.lb_keogh(i_data, j_data, window)
                        if not dtw_distance == -1:
                            similarity_matrix[i, j] += dtw_distance
                            matching_number_matrix[i, j] += 1
                    else:
                        # static attribute: squared difference of the (non-nan) means
                        i_data = util.get_non_nans(i_data)
                        j_data = util.get_non_nans(j_data)
                        if len(i_data) > 0 and len(j_data) > 0:
                            simple_distance = math.pow(np.mean(i_data) - np.mean(j_data), 2)
                            similarity_matrix[i, j] += simple_distance
                            matching_number_matrix[i, j] += 1
                similarity_matrix[j, i] = similarity_matrix[i, j]
                matching_number_matrix[j, i] = matching_number_matrix[i, j]

        # We calculate the average score per item matched.
        # Best might be to apply a weighting scheme now.
        similarity_matrix = similarity_matrix / matching_number_matrix

        results = perform_classification(similarity_matrix, y, out_dir, k)
        results_list.append(results)

        in_out.save_results(out_dir + str(k) + '.csv', ["fpr", "tpr", "auc", "cm"],
                            results[1:len(results)], [sum(y), len(y)])

        in_out.save_ROC(out_dir + '/roc.png', results_list, title='ROC curve')

    # notify user
    print('## Learning Finished ##')
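# dtw.lb_keogh above is an external helper; the sketch below only illustrates the standard
# LB_Keogh lower bound on DTW distance (sum of squared deviations of one series outside a
# windowed min/max envelope of the other), under the assumption that the helper follows this
# formulation. It is NOT the implementation used by this project, and unlike the helper it
# does not return -1 for incomparable inputs.
def lb_keogh_sketch(s1, s2, window):
    """LB_Keogh lower bound between two equal-length, nan-free sequences s1 and s2."""
    total = 0.0
    n = len(s2)
    for idx, value in enumerate(s1):
        lo = max(0, idx - window)
        hi = min(n, idx + window + 1)
        lower_bound = min(s2[lo:hi])   # bottom of the envelope around position idx
        upper_bound = max(s2[lo:hi])   # top of the envelope around position idx
        if value > upper_bound:
            total += (value - upper_bound) ** 2
        elif value < lower_bound:
            total += (value - lower_bound) ** 2
    return total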
def execute(in_dir, out_dir, record_id, target_id, algorithms, feature_selection,
            separate_testset, in_dir_test):
    '''executes the learning task on the data in in_dir with the algorithms in algorithms.
       The results are written to out_dir and subdirectories, and the record_ and target_ids
       are used to differentiate attributes and non-attributes'''
    print('### executing learning algorithms on... ###')

    # get the files
    files = util.list_dir_csv(in_dir)

    # stop if no files found
    if not files:
        print('No appropriate csv files found. Select an input directory with appropriate files')
        return

    if separate_testset:
        files_test = util.list_dir_csv(in_dir_test)
    else:
        files_test = files

    # create directory
    util.make_dir(out_dir)

    # execute each algorithm
    for alg in algorithms:
        print('...{}'.format(alg))

        util.make_dir(out_dir + '/' + alg + '/')
        results_list = []  # list which will contain the results
        if separate_testset:
            results_list2 = []
            util.make_dir(out_dir + '/' + alg + '_test/')

        # run algorithm alg for each file f
        for f, f_test in zip(files, files_test):
            fname = in_out.get_file_name(f, extension=False)
            print(' ...{}'.format(fname))

            # get data, split in features/target. If invalid stuff happened --> exit
            # assumption: first column is patientnumber and is pruned, last is target
            X, y, headers = in_out.import_data(f, record_id, target_id)
            if type(X) == bool:
                return

            print(' ...instances: {}, attributes: {}'.format(X.shape[0], X.shape[1]))

            model, best_features, results = execute_with_algorithm(
                alg, X, y, fname, headers, out_dir + '/' + alg + '/',
                record_id, target_id, feature_selection)
            results_list.append(results)

            if separate_testset:
                # assumption: first column is patientnumber and is pruned, last is target
                X, y, headers = in_out.import_data(f_test, record_id, target_id)
                if type(X) == bool:
                    return

                print(' ...instances: {}, attributes: {} (test set)'.format(X.shape[0], X.shape[1]))

                results = predict_separate(X, y, fname, out_dir + '/' + alg + '_test/',
                                           record_id, target_id, feature_selection,
                                           model, best_features)
                results_list2.append(results)

        try:
            in_out.save_ROC(out_dir + '/' + alg + '/' + "roc.png", results_list, title='ROC curve')
        except IndexError:
            pass

        try:
            in_out.save_ROC(out_dir + '/' + alg + '_test/' + "roc.png", results_list2, title='ROC curve')
        except NameError:
            pass

    # notify user
    print('## Learning Finished ##')