def enrich_from_file(self, in_dir):
    '''enrich using a data file as source: read the medication csv and
    run ATC enrichment on its records.

    in_dir -- directory containing the input csv files (must be non-empty)
    Returns whatever self.atc_enrichment returns.
    '''
    assert in_dir != ''
    files = util.list_dir_csv(in_dir)
    med_f = util.select_file(files, 'medicatie')
    records = io.read_csv(med_f)
    # use the next() builtin instead of the Python-2-only .next() method;
    # this matches the sibling enrich_from_file that already uses next()
    headers = util.get_headers(next(records))
    idx = headers.index('atc_code')
    return self.atc_enrichment(records, idx)
def enrich_from_file(self, in_dir):
    '''enrich using a data file as source: read the journal csv and
    run ICPC enrichment on its records.'''
    assert in_dir != ''
    csv_files = util.list_dir_csv(in_dir)
    journal_file = util.select_file(csv_files, 'journaal')
    reader = io.read_csv(journal_file)
    # first row of the csv holds the column names
    header_row = util.get_headers(next(reader))
    icpc_idx = header_row.index('icpc')
    return self.icpc_enrichment(reader, icpc_idx)
def enrich_from_file(self, in_dir):
    '''enrich using a data file as source: read the journal csv and
    run ICPC enrichment on its records.

    in_dir -- directory containing the input csv files (must be non-empty)
    Returns whatever self.icpc_enrichment returns.
    '''
    assert in_dir != ''
    files = util.list_dir_csv(in_dir)
    med_f = util.select_file(files, 'journaal')
    records = io.read_csv(med_f)
    # use the next() builtin instead of the Python-2-only .next() method;
    # this matches the other enrich_from_file variant in this file
    headers = util.get_headers(next(records))
    idx = headers.index('icpc')
    return self.icpc_enrichment(records, idx)
def _load_or_process(self, label, survival_loads, normal_loads, process_fn):
    '''Load previously pickled feature dicts for one data source, or fall
    back to (re)processing the raw csv data when the pickle is missing.

    label          -- human-readable source name used in progress messages
    survival_loads -- (dict_name, headers_name) pairs loaded when
                      self.survival is set
    normal_loads   -- (dict_name, headers_name) pairs loaded otherwise
    process_fn     -- bound method that processes the raw csv data
    '''
    print('...processing ' + label)
    if not self.already_processed:
        process_fn()
        return
    try:
        loads = survival_loads if self.survival else normal_loads
        for dict_name, headers_name in loads:
            self.load_data(dict_name, headers_name)
    except (TypeError, ValueError):
        # pickle missing or unreadable -> process the raw data instead
        # (the original printed 'medication' for every source; kept as-is)
        print('Data not available, processing medication data')
        process_fn()

def process_csv(self, needs_processing):
    '''converts the specified csv's to usable data

    needs_processing -- dict mapping source names (e.g. 'medication') to a
                        flag saying whether that source must be processed
    '''
    # get all csv's (and previously pickled results) in the input folder
    self.files = util.list_dir_csv(self.in_dir)
    self.pickle_files = util.list_dir_pickle(self.in_dir)

    # put the IDs of the 'main' file in a dict
    if self.already_processed:
        try:
            ID_f = util.select_file(self.pickle_files, 'patient_dict')
            self.id2data = load_obj(ID_f)
            self.headers = ['ID', 'age', 'gender']
            print('yyy')
        except TypeError:
            # no patient pickle available -> build the dict from the csv
            ID_f = util.select_file(self.files, 'patient')
            rows, fields = util.import_data(ID_f, delim=self.delim)
            self.headers = self.get_IDs(rows, fields)
    else:
        ID_f = util.select_file(self.files, 'patient')
        rows, fields = util.import_data(ID_f, delim=self.delim)
        self.headers = self.get_IDs(rows, fields)

    if self.survival:
        # survival analysis needs a baseline start date per patient
        ID_f = util.select_file(self.files, 'icpc')
        rows, fields = util.import_data(ID_f, delim=self.delim)
        self.insert_start_baseline(rows, fields)

    # add stroke value to each patient
    if self.already_processed:
        try:
            stroke_f = util.select_file(self.pickle_files, 'stroke_dict')
            self.id2data = load_obj(stroke_f)
            print('xxx')
        except (TypeError, ValueError):
            stroke_f = util.select_file(self.files, 'icpc')
            rows, fields = util.import_data(stroke_f, delim=self.delim)
            self.get_stroke_occurrences(rows, fields)
    else:
        # add stroke value to each patient
        stroke_f = util.select_file(self.files, 'icpc')
        rows, fields = util.import_data(stroke_f, delim=self.delim)
        self.get_stroke_occurrences(rows, fields)

    # randomize dates if non-survival
    if not self.survival:
        self.insert_data_intervals()
    else:
        self.insert_survival_intervals()

    # Per-source feature extraction. Every section followed the exact same
    # load-pickle-or-process pattern, so it is expressed data-driven here:
    # (needs_processing key, progress label, survival loads, normal loads,
    #  raw-processing method).
    # NOTE(review): the odd header names below ('consults_headers0' for
    # smoking, 'renal_function_headers0' for cardiometabolism) and the
    # survival-only renal_function loads reproduce the original code
    # exactly; they look like copy-paste slips -- confirm before changing.
    sections = [
        ('medication', 'medication',
         [('atc0_survival', 'atc0_headers0')],
         [('atc_dict0', 'atc_headers0')],
         self.process_medication),
        ('consults', 'consults',
         [('consults_dict0_survival', 'consults_headers0')],
         [('consults_dict0', 'consults_headers0')],
         self.process_consults),
        ('actions', 'actions',
         [('actions_dict0_survival', 'actions_headers0')],
         [('actions_dict0', 'actions_headers0')],
         self.process_actions),
        # is only this one suitable for temporal???
        ('icpc', 'ICPC',
         [('icpc_dict0_survival', 'icpc_headers0')],
         [('icpc_dict0', 'icpc_headers0')],
         self.process_icpc),
        ('lab_results', 'lab results',
         [('lab_results_dict0_survival', 'lab_results_headers0')],
         [('lab_results_dict0', 'lab_results_headers0')],
         self.process_labresults),
        ('smoking', 'smoking',
         [('smoking_dict0_survival', 'consults_headers0'),
          ('smoking_dict1_survival', 'smoking_headers1')],
         [('smoking_dict0', 'smoking_headers0'),
          ('smoking_dict1', 'smoking_headers1')],
         self.process_smoking),
        ('bmi', 'bmi',
         [('bmi_dict0_survival', 'bmi_headers0'),
          ('bmi_dict1_survival', 'bmi_headers1'),
          ('bmi_dict2_survival', 'bmi_headers2')],
         [('bmi_dict0', 'bmi_headers0'),
          ('bmi_dict1', 'bmi_headers1'),
          ('bmi_dict2', 'bmi_headers2')],
         self.process_bmi),
        ('allergies', 'allergies',
         [('allergies_dict0_survival', 'allergies_headers0')],
         [('allergies_dict0', 'allergies_headers0')],
         self.process_allergies),
        ('blood_pressure', 'blood pressure',
         [('blood_pressure_dict0_survival', 'blood_pressure_headers0')],
         [('blood_pressure_dict0', 'blood_pressure_headers0'),
          ('blood_pressure_dict1', 'blood_pressure_headers1')],
         self.process_bloodpressure),
        ('alcohol', 'alcohol',
         [('alcohol_dict0_survival', 'alcohol_headers0')],
         [('alcohol_dict0', 'alcohol_headers0')],
         self.process_alcohol),
        ('renal_function', 'renal function',
         [('renal_function_dict0_survival', 'renal_function_headers0'),
          ('renal_function_dict0', 'renal_function_headers0')],
         [],  # NOTE(review): original had no non-survival load branch
         self.process_renalfunction),
        ('cardiometabolism', 'cardiometabolism',
         [('cardiometabolism_dict0_survival', 'renal_function_headers0')],
         [('cardiometabolism_dict0', 'cardiometabolism_headers0')],
         self.process_cardiometabolism),
        ('lab_blood', 'lab blood',
         [('lab_blood_dict0_survival', 'lab_blood_headers0')],
         [('lab_blood_dict0', 'lab_blood_headers0')],
         self.process_lab_blood),
        ('lung_function', 'lung function',
         [('lung_function_dict0_survival', 'lung_function_headers0')],
         [('lung_function_dict0', 'lung_function_headers0')],
         self.process_lung_function),
    ]
    for key, label, survival_loads, normal_loads, process_fn in sections:
        if key in needs_processing and needs_processing[key]:
            self._load_or_process(label, survival_loads, normal_loads,
                                  process_fn)

    # move stroke indicator to end of each instance data list
    self.move_target_to_end_of_list()

    # append target element to headers, add to class var
    self.headers.append('target')

    # drop patients whose first stroke date is before 2007
    to_remove = []
    for key, d in self.id2data.items():
        date_info = d['stroke_dates']
        if self.survival:
            print(date_info[0])
            if not isinstance(date_info[0], list):
                if int(str(date_info[0]).split('-')[0]) < 2007:
                    to_remove.append(key)
                    continue
        else:
            if str(date_info[0]) != 'negative':
                if int(str(date_info[0]).split('-')[0]) < 2007:
                    to_remove.append(key)
                    continue
    print(len(to_remove))
    for key in to_remove:
        del self.id2data[key]
def execute(in_dir, out_dir, record_id, target_id, day_id, day, algorithms, feature_selection, separate_testset, in_dir_test):
    '''executes the learning task on the data in in_dir with the algorithms
    in algorithms. The results are written to out_dir and subdirectories,
    and the record_ and target_ids are used to differentiate attributes and
    non-attributes'''
    print('### executing learning algorithms on... ###')

    # get the files
    files = util.list_dir_csv(in_dir)

    # stop if no files found
    if not files:
        print('No appropriate csv files found. Select an input directory with appropriate files')
        return

    if separate_testset:
        files_test = util.list_dir_csv(in_dir_test)
    else:
        files_test = files

    # create directory
    util.make_dir(out_dir)

    # execute each algorithm
    for alg in algorithms:
        print('...{}'.format(alg))
        util.make_dir(out_dir + '/' + alg + '/')
        results_list = []  # will contain the results per input file
        if separate_testset:
            results_list2 = []
            util.make_dir(out_dir + '/' + alg + '_test/')

        # run algorithm alg for each file f
        for f, f_test in zip(files, files_test):
            fname = in_out.get_file_name(f, extension=False)
            print(' ...{}'.format(fname))

            # get data, split in features/target. If invalid stuff happened --> exit
            # assumption: first column is patientnumber and is pruned, last is target
            X, y, headers = in_out.import_data(f, record_id, target_id)
            if isinstance(X, bool):
                return

            # keep only the rows belonging to the requested day, then drop
            # the day column itself; rows are collected in a list and
            # stacked once (the original appended row-by-row, O(n^2))
            day_index = headers.index(day_id)
            kept_rows = []
            new_y = []
            for i in range(X.shape[0]):
                if X[i, day_index] == day:
                    kept_rows.append(np.array(X[i, :]).reshape(-1))
                    new_y.append(int(y[i]))
            if kept_rows:
                new_X = np.vstack(kept_rows)
            else:
                new_X = np.zeros((0, len(headers)))
            X = np.delete(new_X, day_index, 1)
            y = np.squeeze(np.asarray(new_y))

            # NOTE(review): 'headers' still contains day_id here although
            # its column was removed from X; the original code also passed
            # the full header list on -- confirm execute_with_algorithm
            # copes with this before changing it.
            print(' ...instances: {}, attributes: {}'.format(X.shape[0], X.shape[1]))

            model, best_features, results = execute_with_algorithm(
                alg, X, y, fname, headers, out_dir + '/' + alg + '/',
                record_id, target_id, feature_selection)
            results_list.append(results)

            if separate_testset:
                # assumption: first column is patientnumber and is pruned, last is target
                X, y, headers = in_out.import_data(f_test, record_id, target_id)
                if isinstance(X, bool):
                    return
                print(' ...instances: {}, attributes: {} (test set)'.format(X.shape[0], X.shape[1]))
                results = predict_separate(X, y, fname, out_dir + '/' + alg + '_test/', record_id, target_id, feature_selection, model, best_features)
                results_list2.append(results)

        try:
            in_out.save_ROC(out_dir + '/' + alg + '/' + "roc.png", results_list, title='ROC curve')
        except IndexError:
            pass
        try:
            # results_list2 only exists when separate_testset was set
            in_out.save_ROC(out_dir + '/' + alg + '_test/' + "roc.png", results_list2, title='ROC curve')
        except NameError:
            pass

    # notify user
    print('## Learning Finished ##')
def execute(in_dir, out_dir, record_id, target_id, algorithms, feature_selection, separate_testset, in_dir_test, survival, oversampling, undersampling, aggregation):
    '''executes the learning task on the data in in_dir with the algorithms
    in algorithms. The results are written to out_dir and subdirectories,
    and the record_ and target_ids are used to differentiate attributes and
    non-attributes'''
    print('### executing learning algorithms on... ###')

    # collect the input csv files; bail out when there are none
    files = util.list_dir_csv(in_dir)
    if not files:
        print('No appropriate csv files found. Select an input directory with appropriate files')
        return

    files_test = util.list_dir_csv(in_dir_test) if separate_testset else files

    util.make_dir(out_dir)

    # run every requested algorithm over every input file
    for alg in algorithms:
        print('...{}'.format(alg))
        util.make_dir(out_dir + '/' + alg + '/')

        results_list = []  # per-file results for this algorithm
        if separate_testset:
            results_list2 = []
            util.make_dir(out_dir + '/' + alg + '_test/')

        for train_file, test_file in zip(files, files_test):
            fname = in_out.get_file_name(train_file, extension=False)
            print(' ...{}'.format(fname))

            # get data, split in features/target; a boolean X signals an
            # import failure, in which case we exit. First column is
            # assumed to be the patientnumber (pruned), last the target.
            X, y, headers, target_list = in_out.import_data(train_file, record_id, target_id, survival)
            if type(X) == bool:
                return

            if aggregation == True:
                X, headers = aggregations(train_file, target_list, survival)

            print(' ...instances: {}, attributes: {}'.format(X.shape[0], X.shape[1]))

            model, best_features, results = execute_with_algorithm(
                alg, X, y, fname, headers,
                out_dir + '/' + alg + '/', record_id, target_id,
                feature_selection, oversampling, survival, undersampling,
                aggregation)
            results_list.append(results)

            if separate_testset:
                X, y, headers = in_out.import_data(test_file, record_id, target_id)
                if type(X) == bool:
                    return
                print(' ...instances: {}, attributes: {} (test set)'.format(X.shape[0], X.shape[1]))
                results = predict_separate(
                    X, y, fname, out_dir + '/' + alg + '_test/', record_id,
                    target_id, feature_selection, model, best_features)
                results_list2.append(results)

        try:
            in_out.save_ROC(out_dir + '/' + alg + '/' + "roc.png", results_list, title='ROC curve')
        except IndexError:
            pass
        try:
            # results_list2 is only bound when separate_testset was set
            in_out.save_ROC(out_dir + '/' + alg + '_test/' + "roc.png", results_list2, title='ROC curve')
        except NameError:
            pass

    # notify user
    print('## Learning Finished ##')
def process_csv(self, needs_processing):
    '''converts the specified csv's to usable data

    needs_processing -- dict mapping source names (e.g. 'medication') to a
                        flag saying whether that source must be processed
    '''
    # get all csv's in the input folder
    files = util.list_dir_csv(self.in_dir)

    # put the IDs of the 'main' file in a dict
    ID_f = util.select_file(files, 'patient')
    rows, fields = util.import_data(ID_f, delim=self.delim)
    headers = self.get_IDs(rows, fields)

    # add CRC value to each patient
    CRC_f = util.select_file(files, 'journaal')
    rows, fields = util.import_data(CRC_f, delim=self.delim)
    self.get_CRC_occurrences(rows, fields)

    # randomize dates
    self.insert_data_intervals()

    # gather data from medication csv
    # (print statements converted to the print() function so the file
    # also runs under Python 3; single-argument form is Py2-compatible)
    if 'medication' in needs_processing and needs_processing['medication']:
        print('...processing medication')
        med_f = util.select_file(files, 'medicatie')
        rows, fields = util.import_data(med_f, delim=self.delim)
        med_headers, self.num_med, self.num_med_pos = self.insert_data(
            rows, fields, 'atc_code',
            ['voorschrijfdatum', 'voorschrijfdatum'],
            '[A-Z][0-9][0-9]', 3, suffix='atc')
        headers = headers + med_headers

    # gather data from consult csv
    if 'consults' in needs_processing and needs_processing['consults']:
        print('...processing consults')
        consult_f = util.select_file(files, 'journaal')
        rows, fields = util.import_data(consult_f, delim=self.delim)
        consult_headers, self.num_cons, self.num_cons_pos = self.insert_data(
            rows, fields, 'icpc', ['datum', 'datum'],
            '[A-Z][0-9][0-9]', 3, incorporate_SOEP='soepcode')
        headers = headers + consult_headers

    # gather data from referral csv
    if 'referrals' in needs_processing and needs_processing['referrals']:
        print('...processing referrals')
        ref_f = util.select_file(files, 'verwijzing')
        rows, fields = util.import_data(ref_f, delim=self.delim)
        ref_headers, _, _ = self.insert_data(
            rows, fields, 'specialisme', ['datum', 'datum'], '.*', None)
        headers = headers + ref_headers

    # gather data from comorbidity csv
    if 'comorbidity' in needs_processing and needs_processing['comorbidity']:
        print('...processing comorbidity')
        comor_f = util.select_file(files, 'comorbiditeit')
        rows, fields = util.import_data(comor_f, delim=self.delim)
        comor_headers, _, _ = self.insert_data(
            rows, fields, 'omschrijving', ['begindatum', 'einddatum'],
            '.+', None, suffix='comorbiditeit')
        headers = headers + comor_headers

    # gather data from lab results csv
    if 'lab_results' in needs_processing and needs_processing['lab_results']:
        print('...processing lab results')
        lab_f = util.select_file(files, 'bepaling')
        rows, fields = util.import_data(lab_f, delim=self.delim)
        lab_headers, self.num_lab, self.num_lab_pos = self.insert_data(
            rows, fields, 'code', ['datum', 'datum'],
            '.+', None, suffix='lab_results')
        headers = headers + lab_headers

    # move CRC indicator to end of each instance data list
    self.move_target_to_end_of_list()

    # append target element to headers, add to class var
    headers.append('target')
    self.headers = headers
def execute(in_dir, out_dir, record_id, target_id, algorithms, feature_selection, separate_testset, in_dir_test):
    '''executes the learning task on the data in in_dir with the algorithms
    in algorithms. The results are written to out_dir and subdirectories,
    and the record_ and target_ids are used to differentiate attributes and
    non-attributes'''
    print('### executing learning algorithms on... ###')

    # get the files
    files = util.list_dir_csv(in_dir)

    # stop if no files found
    if not files:
        print('No appropriate csv files found. Select an input directory with appropriate files')
        return

    if separate_testset:
        files_test = util.list_dir_csv(in_dir_test)
    else:
        files_test = files

    # create directory
    util.make_dir(out_dir)

    # execute each algorithm
    for alg in algorithms:
        print('...{}'.format(alg))
        util.make_dir(out_dir + '/' + alg + '/')
        results_list = []  # will contain the results per input file
        if separate_testset:
            results_list2 = []
            util.make_dir(out_dir + '/' + alg + '_test/')

        # run algorithm alg for each file f
        for f, f_test in zip(files, files_test):
            fname = in_out.get_file_name(f, extension=False)
            print(' ...{}'.format(fname))

            # get data, split in features/target. If invalid stuff happened --> exit
            # assumption: first column is patientnumber and is pruned, last is target
            X, y, headers = in_out.import_data(f, record_id, target_id)
            if isinstance(X, bool):
                return

            print(' ...instances: {}, attributes: {}'.format(X.shape[0], X.shape[1]))

            model, best_features, results = execute_with_algorithm(
                alg, X, y, fname, headers, out_dir + '/' + alg + '/',
                record_id, target_id, feature_selection)
            results_list.append(results)

            if separate_testset:
                # assumption: first column is patientnumber and is pruned, last is target
                X, y, headers = in_out.import_data(f_test, record_id, target_id)
                if isinstance(X, bool):
                    return
                print(' ...instances: {}, attributes: {} (test set)'.format(X.shape[0], X.shape[1]))
                results = predict_separate(X, y, fname, out_dir + '/' + alg + '_test/', record_id, target_id, feature_selection, model, best_features)
                results_list2.append(results)

        try:
            in_out.save_ROC(out_dir + '/' + alg + '/' + "roc.png", results_list, title='ROC curve')
        except IndexError:
            pass
        try:
            # results_list2 only exists when separate_testset was set
            in_out.save_ROC(out_dir + '/' + alg + '_test/' + "roc.png", results_list2, title='ROC curve')
        except NameError:
            pass

    # notify user
    print('## Learning Finished ##')
def execute_knn(in_dir, out_dir, record_id, target_id, day_id, day, k):
    '''executes the learning task on the data in in_dir with the algorithms
    in algorithms. The results are written to out_dir and subdirectories,
    and the record_ and target_ids are used to differentiate attributes and
    non-attributes'''
    print('### executing learning algorithms on... ###')

    # get the files
    files = util.list_dir_csv(in_dir)

    # stop if no files found
    if not files:
        print('No appropriate csv files found. Select an input directory with appropriate files')
        return

    # create directory
    util.make_dir(out_dir)

    # run the knn procedure for each file f
    for f in files:
        results_list = []
        fname = in_out.get_file_name(f, extension=False)
        print(' ...{}'.format(fname))

        # get data, split in features/target. If invalid stuff happened --> exit
        # assumption: first column is patientnumber
        X, y, headers = in_out.import_data(f, record_id, target_id, True)
        if isinstance(X, bool):
            return

        day_index = headers.index(day_id)
        new_X = np.zeros((0, len(headers)))
        new_y = []
        IDs = []
        IDrows = {}

        # ordering of time points and complete data (filled with nan's if
        # not available) assumed!
        # Select the right day (day == -1 keeps every time point) and
        # remember, per patient ID, which rows belong to it
        new_index = 0
        for i in range(X.shape[0]):
            if X[i, day_index] == day or day == -1:
                row = np.array(X[i, :]).reshape(-1)
                if row[0] not in IDs:
                    IDs.append(row[0])
                    new_y.append(int(y[i]))
                    IDrows[row[0]] = [new_index]
                else:
                    IDrows[row[0]].append(new_index)
                new_X = np.append(new_X, np.column_stack(row), axis=0)
                new_index += 1

        # Remove the id, the day, and the time stamp from the data and headers.
        new_X = np.delete(new_X, 2, 1)
        new_X = np.delete(new_X, 1, 1)
        new_X = np.delete(new_X, 0, 1)
        new_headers = headers[3:len(headers)]
        X = new_X

        # Remove columns with only a single value or all nans
        non_singular_rows = [
            i for i in range(X.shape[1])
            if len(set(util.get_non_nans(X[:, i].tolist()))) > 1
        ]
        X = X[:, non_singular_rows]
        new_headers = np.array(new_headers)[non_singular_rows].tolist()

        max_values = np.nanmax(X, axis=0)
        min_values = np.nanmin(X, axis=0)
        # NOTE(review): 'ranges' guards against zero differences but is
        # never used; the scaling below divides by (max - min) directly, so
        # a constant column would still yield inf/nan -- confirm intent
        # before changing.
        ranges = []
        for i in range(len(min_values)):
            diff = max_values[i] - min_values[i]
            if diff == 0:
                print('difference of zero encountered in ' + str(i))
                print('Max values: ' + str(max_values[i]))
                print('Min values: ' + str(min_values[i]))
                ranges.append(1)
            else:
                ranges.append(diff)

        # Now do some scaling to get the values to the same order or magnitude
        X = (X - min_values) / (max_values - min_values)
        y = np.squeeze(np.asarray(new_y))

        # collapse each patient's row indices to a first/last span
        for ID in IDs:
            IDrows[ID] = {
                'first_row': min(IDrows[ID]),
                'last_row': max(IDrows[ID])
            }

        print(' ...instances: {}, attributes: {}'.format(X.shape[0], X.shape[1]))

        # Now we are going to build the similarity matrix. We are also going
        # to store how many attributes we were actually able to compare.
        # NOTE(review): dtw_attr and window are not defined in this
        # function; presumably module-level globals -- confirm.
        similarity_matrix = np.zeros((len(IDs), len(IDs)))
        matching_number_matrix = np.ones((len(IDs), len(IDs)))
        for i in range(len(IDs)):
            for j in range(i + 1, len(IDs)):
                for attr in range(len(new_headers)):
                    i_data = X[IDrows[IDs[i]]['first_row']:
                               IDrows[IDs[i]]['last_row'] + 1, attr].tolist()
                    j_data = X[IDrows[IDs[j]]['first_row']:
                               IDrows[IDs[j]]['last_row'] + 1, attr].tolist()
                    if new_headers[attr] in dtw_attr:
                        # time-series attribute: dtw-based distance
                        dtw_distance = dtw.lb_keogh(i_data, j_data, window)
                        if not dtw_distance == -1:
                            similarity_matrix[i, j] += dtw_distance
                            matching_number_matrix[i, j] += 1
                    else:
                        # plain attribute: squared difference of the means
                        i_data = util.get_non_nans(i_data)
                        j_data = util.get_non_nans(j_data)
                        if len(i_data) > 0 and len(j_data) > 0:
                            simple_distance = math.pow(
                                np.mean(i_data) - np.mean(j_data), 2)
                            similarity_matrix[i, j] += simple_distance
                            matching_number_matrix[i, j] += 1
                # the matrix is symmetric
                similarity_matrix[j, i] = similarity_matrix[i, j]
                matching_number_matrix[j, i] = matching_number_matrix[i, j]

        # We calculate the average score per item matched
        # Best might be to apply a weighting scheme now.
        similarity_matrix = similarity_matrix / matching_number_matrix

        results = perform_classification(similarity_matrix, y, out_dir, k)
        results_list.append(results)
        in_out.save_results(out_dir + str(k) + '.csv',
                            ["fpr", "tpr", "auc", "cm"],
                            results[1:len(results)], [sum(y), len(y)])
        in_out.save_ROC(out_dir + '/roc.png', results_list, title='ROC curve')

    # notify user
    print('## Learning Finished ##')
def execute_knn(in_dir, out_dir, record_id, target_id, day_id, day, k):
    '''executes the learning task on the data in in_dir with the algorithms
    in algorithms. The results are written to out_dir and sub_directories,
    and the record_ and target_ids are used to differentiate attributes and
    non-attributes'''
    print('### executing learning algorithms on... ###')

    # get the files
    files = util.list_dir_csv(in_dir)

    # stop if no files found
    if not files:
        print('No appropriate csv files found. Select an input directory with appropriate files')
        return

    # create directory
    util.make_dir(out_dir)

    # run the knn procedure for each file f
    for f in files:
        results_list = []
        fname = in_out.get_file_name(f, extension=False)
        print(' ...{}'.format(fname))

        # get data, split in features/target. If invalid stuff happened --> exit
        # assumption: first column is patientnumber
        X, y, headers = in_out.import_data(f, record_id, target_id, True)
        if isinstance(X, bool):
            return

        day_index = headers.index(day_id)
        new_X = np.zeros((0, len(headers)))
        new_y = []
        IDs = []
        IDrows = {}

        # hard-coded patient IDs excluded from the analysis
        # (note: 133 appears twice; harmless for membership tests)
        exclude = [
            146, 140, 95, 123, 88, 133, 22, 65, 49, 114, 178, 55, 133, 138,
            34, 186, 20, 73
        ]

        # ordering of time points and complete data (filled with nan's if
        # not available) assumed! Select the right day (day == -1 keeps
        # every time point); excluded IDs are not indexed but their rows
        # are still appended, keeping new_index aligned with new_X rows
        new_index = 0
        for i in range(X.shape[0]):
            if X[i, day_index] == day or day == -1:
                row = np.array(X[i, :]).reshape(-1)
                if row[0] not in IDs and row[0] not in exclude:
                    IDs.append(row[0])
                    new_y.append(int(y[i]))
                    IDrows[row[0]] = [new_index]
                elif row[0] not in exclude:
                    IDrows[row[0]].append(new_index)
                new_X = np.append(new_X, np.column_stack(row), axis=0)
                new_index += 1

        ID_column = new_X[:, 0]

        # Remove the id, the day, and the time stamp from the data and headers.
        new_X = np.delete(new_X, 2, 1)
        new_X = np.delete(new_X, 1, 1)
        new_X = np.delete(new_X, 0, 1)
        new_headers = headers[3:len(headers)]
        # attributes treated as time series (compared via dtw)
        dtw_attr = ['hr', 'resp', 'nbp', 'sbp', 'dbp', 'so2']
        X = new_X
        print(len(X))

        # Remove columns with only a single value or all nans
        non_singular_rows = [
            i for i in range(X.shape[1])
            if len(set(util.get_non_nans(X[:, i].tolist()))) > 1
        ]
        X = X[:, non_singular_rows]
        new_headers = np.array(new_headers)[non_singular_rows].tolist()
        print(str(len(new_headers)) + "length new headers after non singular rows")
        print(new_headers)
        print("Removed columns with only nan of 1 value")

        max_values = np.nanmax(X, axis=0)
        min_values = np.nanmin(X, axis=0)
        # NOTE(review): 'ranges' guards zero differences but is never used;
        # the scaling divides by (max - min) directly -- confirm intent.
        ranges = []
        for i in range(len(min_values)):
            diff = max_values[i] - min_values[i]
            if diff == 0:
                print('difference of zero encountered in ' + str(i))
                print('Max values: ' + str(max_values[i]))
                print('Min values: ' + str(min_values[i]))
                ranges.append(1)
            else:
                ranges.append(diff)

        # Now do some scaling to get the values to the same order or magnitude
        X = (X - min_values) / (max_values - min_values)
        y = np.squeeze(np.asarray(new_y))
        print("Scaling done!")

        # collapse each patient's row indices to a first/last span
        for ID in IDs:
            IDrows[ID] = {
                'first_row': min(IDrows[ID]),
                'last_row': max(IDrows[ID])
            }

        print(' ...instances: {}, attributes: {}'.format(X.shape[0], X.shape[1]))

        # Now we are going to build the similarity matrix, attribute by
        # attribute (each attribute's contribution is normalized by its
        # maximum). We also store how many attributes we were actually
        # able to compare.
        # NOTE(review): 'window' is not defined in this function;
        # presumably a module-level global -- confirm.
        similarity_matrix = np.ones((len(IDs), len(IDs)))
        matching_number_matrix = np.ones((len(IDs), len(IDs)))
        for attr in range(len(new_headers)):
            print(str(attr) + "attribute in KNN loop")
            print(str(attr) + "/" + str(len(new_headers)))
            temp = np.ones((len(IDs), len(IDs)))
            temp[:] = 2
            for i in range(len(IDs)):
                for j in range(i + 1, len(IDs)):
                    i_data = X[IDrows[IDs[i]]['first_row']:
                               IDrows[IDs[i]]['last_row'] + 1, attr].tolist()
                    j_data = X[IDrows[IDs[j]]['first_row']:
                               IDrows[IDs[j]]['last_row'] + 1, attr].tolist()
                    if new_headers[attr] in dtw_attr:
                        # time-series attribute: dtw-based distance
                        dtw_distance = dtw.lb_keogh(i_data, j_data, window)
                        if not dtw_distance == -1:
                            temp[i, j] += dtw_distance
                            matching_number_matrix[i, j] += 1
                            matching_number_matrix[j, i] = matching_number_matrix[i, j]
                            temp[j, i] = temp[i, j]
                    else:
                        # plain attribute: squared difference of the means
                        i_data = util.get_non_nans(i_data)
                        j_data = util.get_non_nans(j_data)
                        if len(i_data) > 0 and len(j_data) > 0:
                            simple_distance = math.pow(
                                np.mean(i_data) - np.mean(j_data), 2)
                            temp[i, j] += simple_distance
                            matching_number_matrix[i, j] += 1
                            matching_number_matrix[j, i] = matching_number_matrix[i, j]
                            temp[j, i] = temp[i, j]
            # normalize this attribute's contribution before accumulating
            if np.max(temp) != 0:
                temp = temp / np.max(temp)
            similarity_matrix += temp

        # We calculate the average score per item matched
        # Best might be to apply a weighting scheme now.
        similarity_matrix = (similarity_matrix / matching_number_matrix) + (
            1 / matching_number_matrix)
        print(len(IDs))

        results = perform_classification(similarity_matrix, y, out_dir, k)
        results_list.append(results)
        print(results)
        in_out.save_results(out_dir + str(k) + '.csv',
                            ["fpr", "tpr", "auc", "cm"],
                            results[1:len(results)], [sum(y), len(y)])
        in_out.save_ROC(out_dir + '/roc.png', results_list, title='ROC curve')

    # notify user
    print('## Learning Finished ##')
    print(similarity_matrix)