def enrich_from_file(self, in_dir):
        '''enrich using a data file as source'''
        assert (in_dir != '')
        files = util.list_dir_csv(in_dir)
        med_f = util.select_file(files, 'medicatie')
        records = io.read_csv(med_f)

        headers = util.get_headers(next(records))
        idx = headers.index('atc_code')

        return self.atc_enrichment(records, idx)
Example #2
    def enrich_from_file(self, in_dir):
        '''enrich using a data file as source'''
        assert (in_dir != '')
        files = util.list_dir_csv(in_dir)
        med_f = util.select_file(files, 'journaal')
        records = io.read_csv(med_f)

        headers = util.get_headers(next(records))
        idx = headers.index('icpc')

        return self.icpc_enrichment(records, idx)
Example #3
	def enrich_from_file(self, in_dir):
		'''enrich using a data file as source'''
		assert(in_dir != '')
		files = util.list_dir_csv(in_dir)
		med_f = util.select_file(files, 'journaal')
		records = io.read_csv(med_f)

		headers = util.get_headers(next(records))
		idx = headers.index('icpc')

		return self.icpc_enrichment(records, idx)
Example #4
    def process_csv(self, needs_processing):
        '''converts the specified CSVs to usable data'''

        # get all CSVs in the input folder
        self.files = util.list_dir_csv(self.in_dir)

        self.pickle_files = util.list_dir_pickle(self.in_dir)

        # put the IDs of the 'main' file in a dict
        if self.already_processed:
            try:
                ID_f = util.select_file(self.pickle_files, 'patient_dict')
                self.id2data = load_obj(ID_f)
                self.headers = ['ID', 'age', 'gender']
                print('...loaded patient dict from pickle')
            except TypeError:
                ID_f = util.select_file(self.files, 'patient')
                rows, fields = util.import_data(ID_f, delim=self.delim)
                self.headers = self.get_IDs(rows, fields)

        else:
            ID_f = util.select_file(self.files, 'patient')
            rows, fields = util.import_data(ID_f, delim=self.delim)
            self.headers = self.get_IDs(rows, fields)

            if self.survival:
                ID_f = util.select_file(self.files, 'icpc')
                rows, fields = util.import_data(ID_f, delim=self.delim)
                self.insert_start_baseline(rows, fields)

        # add stroke value to each patient
        if self.already_processed:
            try:
                stroke_f = util.select_file(self.pickle_files, 'stroke_dict')
                self.id2data = load_obj(stroke_f)
                print('...loaded stroke dict from pickle')
            except (TypeError, ValueError):
                stroke_f = util.select_file(self.files, 'icpc')
                rows, fields = util.import_data(stroke_f, delim=self.delim)
                self.get_stroke_occurrences(rows, fields)

        else:
            stroke_f = util.select_file(self.files, 'icpc')
            rows, fields = util.import_data(stroke_f, delim=self.delim)
            self.get_stroke_occurrences(rows, fields)

        # randomize dates if non-survival
        if not self.survival:
            self.insert_data_intervals()
        else:
            self.insert_survival_intervals()

        # gather data from medication csv
        if 'medication' in needs_processing and needs_processing['medication']:
            print('...processing medication')
            if self.already_processed:
                try:
                    if self.survival:
                        self.load_data('atc0_survival', 'atc0_headers0')
                    else:
                        self.load_data('atc_dict0', 'atc_headers0')
                except (TypeError, ValueError):
                    print('Data not available, processing medication data')
                    self.process_medication()
            else:
                self.process_medication()

        # gather data from consult csv
        if 'consults' in needs_processing and needs_processing['consults']:
            print('...processing consults')
            if self.already_processed:
                try:
                    if self.survival:
                        self.load_data('consults_dict0_survival',
                                       'consults_headers0')
                    else:
                        self.load_data('consults_dict0', 'consults_headers0')
                except (TypeError, ValueError):
                    print('Data not available, processing consult data')
                    self.process_consults()
            else:
                self.process_consults()

        # gather data from verrichtingen csv
        if 'actions' in needs_processing and needs_processing['actions']:
            print('...processing actions')
            if self.already_processed:
                try:
                    if self.survival:
                        self.load_data('actions_dict0_survival',
                                       'actions_headers0')
                    else:
                        self.load_data('actions_dict0', 'actions_headers0')
                except (TypeError, ValueError):
                    print('Data not available, processing action data')
                    self.process_actions()
            else:
                self.process_actions()

        # gather data from icpc csv
        if 'icpc' in needs_processing and needs_processing[
                'icpc']:  # is this the only one suitable for temporal???
            print('...processing ICPC')
            if self.already_processed:
                try:
                    if self.survival:
                        self.load_data('icpc_dict0_survival', 'icpc_headers0')
                    else:
                        self.load_data('icpc_dict0', 'icpc_headers0')
                except (TypeError, ValueError):
                    print('Data not available, processing ICPC data')
                    self.process_icpc()
            else:
                self.process_icpc()

        # gather data from lab results csv
        if 'lab_results' in needs_processing and needs_processing[
                'lab_results']:
            print('...processing lab results')
            if self.already_processed:
                try:
                    if self.survival:
                        self.load_data('lab_results_dict0_survival',
                                       'lab_results_headers0')
                    else:
                        self.load_data('lab_results_dict0',
                                       'lab_results_headers0')
                except (TypeError, ValueError):
                    print('Data not available, processing lab result data')
                    self.process_labresults()
            else:
                self.process_labresults()

        # gather data from smoking file
        if 'smoking' in needs_processing and needs_processing['smoking']:
            print('...processing smoking')
            if self.already_processed:
                try:
                    if self.survival:
                        self.load_data('smoking_dict0_survival',
                                       'smoking_headers0')
                        self.load_data('smoking_dict1_survival',
                                       'smoking_headers1')
                    else:
                        self.load_data('smoking_dict0', 'smoking_headers0')
                        self.load_data('smoking_dict1', 'smoking_headers1')
                except (TypeError, ValueError):
                    print('Data not available, processing smoking data')
                    self.process_smoking()
            else:
                self.process_smoking()

        if 'bmi' in needs_processing and needs_processing['bmi']:
            print('...processing bmi')
            if self.already_processed:
                try:
                    if self.survival:
                        self.load_data('bmi_dict0_survival', 'bmi_headers0')
                        self.load_data('bmi_dict1_survival', 'bmi_headers1')
                        self.load_data('bmi_dict2_survival', 'bmi_headers2')
                    else:
                        self.load_data('bmi_dict0', 'bmi_headers0')
                        self.load_data('bmi_dict1', 'bmi_headers1')
                        self.load_data('bmi_dict2', 'bmi_headers2')
                except (TypeError, ValueError):
                    print('Data not available, processing BMI data')
                    self.process_bmi()
            else:
                self.process_bmi()

        if 'allergies' in needs_processing and needs_processing['allergies']:
            print('...processing allergies')
            if self.already_processed:
                try:
                    if self.survival:
                        self.load_data('allergies_dict0_survival',
                                       'allergies_headers0')
                    else:
                        self.load_data('allergies_dict0', 'allergies_headers0')
                except (TypeError, ValueError):
                    print('Data not available, processing allergy data')
                    self.process_allergies()
            else:
                self.process_allergies()

        if 'blood_pressure' in needs_processing and needs_processing[
                'blood_pressure']:
            print('...processing blood pressure')
            if self.already_processed:
                try:
                    if self.survival:
                        self.load_data('blood_pressure_dict0_survival',
                                       'blood_pressure_headers0')
                        # self.load_data('blood_pressure_dict1_survival', 'blood_pressure_headers1')
                    else:
                        self.load_data('blood_pressure_dict0',
                                       'blood_pressure_headers0')
                        self.load_data('blood_pressure_dict1',
                                       'blood_pressure_headers1')
                except (TypeError, ValueError):
                    print('Data not available, processing blood pressure data')
                    self.process_bloodpressure()
            else:
                self.process_bloodpressure()

        if 'alcohol' in needs_processing and needs_processing['alcohol']:
            print('...processing alcohol')
            if self.already_processed:
                try:
                    if self.survival:
                        self.load_data('alcohol_dict0_survival',
                                       'alcohol_headers0')
                    else:
                        self.load_data('alcohol_dict0', 'alcohol_headers0')
                except (TypeError, ValueError):
                    print('Data not available, processing alcohol data')
                    self.process_alcohol()
            else:
                self.process_alcohol()

        if 'renal_function' in needs_processing and needs_processing[
                'renal_function']:
            print('...processing renal function')
            if self.already_processed:
                try:
                    if self.survival:
                        self.load_data('renal_function_dict0_survival',
                                       'renal_function_headers0')
                    else:
                        self.load_data('renal_function_dict0',
                                       'renal_function_headers0')
                except (TypeError, ValueError):
                    print('Data not available, processing renal function data')
                    self.process_renalfunction()
            else:
                self.process_renalfunction()

        if 'cardiometabolism' in needs_processing and needs_processing[
                'cardiometabolism']:
            print('...processing cardiometabolism')
            if self.already_processed:
                try:
                    if self.survival:
                        self.load_data('cardiometabolism_dict0_survival',
                                       'cardiometabolism_headers0')
                    else:
                        self.load_data('cardiometabolism_dict0',
                                       'cardiometabolism_headers0')
                except (TypeError, ValueError):
                    print('Data not available, processing cardiometabolism data')
                    self.process_cardiometabolism()
            else:
                self.process_cardiometabolism()

        if 'lab_blood' in needs_processing and needs_processing['lab_blood']:
            print('...processing lab blood')
            if self.already_processed:
                try:
                    if self.survival:
                        self.load_data('lab_blood_dict0_survival',
                                       'lab_blood_headers0')
                    else:
                        self.load_data('lab_blood_dict0', 'lab_blood_headers0')
                except (TypeError, ValueError):
                    print('Data not available, processing lab blood data')
                    self.process_lab_blood()
            else:
                self.process_lab_blood()

        if 'lung_function' in needs_processing and needs_processing[
                'lung_function']:
            print('...processing lung function')
            if self.already_processed:
                try:
                    if self.survival:
                        self.load_data('lung_function_dict0_survival',
                                       'lung_function_headers0')
                    else:
                        self.load_data('lung_function_dict0',
                                       'lung_function_headers0')
                except (TypeError, ValueError):
                    print('Data not available, processing lung function data')
                    self.process_lung_function()
            else:
                self.process_lung_function()

        # move stroke indicator to end of each instance data list
        self.move_target_to_end_of_list()

        # append target element to headers, add to class var
        self.headers.append('target')

        # drop patients whose first stroke date falls before 2007
        to_remove = []

        for key, d in self.id2data.items():
            date_info = d['stroke_dates']
            if self.survival:
                if not isinstance(date_info[0], list):
                    if int(str(date_info[0]).split('-')[0]) < 2007:
                        to_remove.append(key)
            else:
                if str(date_info[0]) != 'negative':
                    if int(str(date_info[0]).split('-')[0]) < 2007:
                        to_remove.append(key)

        print('...removing {} patients with pre-2007 events'.format(len(to_remove)))
        for key in to_remove:
            del self.id2data[key]
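
Every source block above repeats the same load-or-process pattern. A minimal sketch of a helper that would collapse the repetition, assuming (as the code above does) that load_data raises TypeError or ValueError when a pickle is missing; the helper name and signature are hypothetical:

    def _load_or_process(self, dict_name, headers_name, process_fn, label):
        '''hypothetical helper: try the pickled data first, fall back to processing'''
        if not self.already_processed:
            process_fn()
            return
        try:
            if self.survival:
                self.load_data(dict_name + '_survival', headers_name)
            else:
                self.load_data(dict_name, headers_name)
        except (TypeError, ValueError):
            print('Data not available, processing {} data'.format(label))
            process_fn()

Each block then reduces to a single call, e.g. self._load_or_process('alcohol_dict0', 'alcohol_headers0', self.process_alcohol, 'alcohol').
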
Example #5
def execute(in_dir, out_dir, record_id, target_id, day_id, day, algorithms, feature_selection, separate_testset, in_dir_test):
	'''executes the learning task on the data in in_dir with the algorithms in algorithms.
		The results are written to out_dir and subdirectories,
	    and the record_ and target_ids are used to differentiate attributes and non-attributes'''
	print('### executing learning algorithms on... ###')

	# get the files
	files = util.list_dir_csv(in_dir)

	# stop if no files found
	if not files:
		print('No appropriate csv files found. Select an input directory with appropriate files')
		return

	if separate_testset:
		files_test = util.list_dir_csv(in_dir_test)
	else:
		files_test = files

	# create directory
	util.make_dir(out_dir)

	# execute each algorithm
	for alg in algorithms:
		print('...{}'.format(alg))

		util.make_dir(out_dir+'/'+alg+'/')
		results_list = []
		if separate_testset:
			results_list2 = []
			util.make_dir(out_dir+'/'+alg+'_test/')

		# list which will contain the results

		# run algorithm alg for each file f
		for f, f_test in zip(files,files_test):
			fname = in_out.get_file_name(f, extension=False)
			print(' ...{}'.format(fname))

			# get data, split into features/target; bail out if the import failed
			X, y, headers = in_out.import_data(f, record_id, target_id) # assumption: first column is patient number and is pruned, last is target
			if isinstance(X, bool): return

			# if separate_testset:
			# 	X, X_te = X
			# 	y, y_te = y
			# 	print '  ...train instances: {}, attributes: {}'.format(X.shape[0], X.shape[1])
			# 	print '  ...test instances: {}, attributes: {}'.format(X_te.shape[0], X_te.shape[1])
			# else:

			# Now remove the ones without a relevant day:

			new_headers = [h for h in headers if not h == day_id]
			day_index = headers.index(day_id)
			new_X = np.zeros((0, len(headers)))
			new_y = []

			for i in range(0, X.shape[0]):
				if X[i, day_index] == day:
					row = np.array(X[i, :]).reshape(-1)
					new_X = np.append(new_X, np.column_stack(row), axis=0)
					new_y.append(int(y[i]))
			new_X = np.delete(new_X, day_index, 1)
			X = new_X
			y = np.squeeze(np.asarray(new_y))

			print('  ...instances: {}, attributes: {}'.format(X.shape[0], X.shape[1]))


			model, best_features, results = execute_with_algorithm(alg, X, y, fname, new_headers, out_dir+'/'+alg+'/', record_id, target_id, feature_selection)
			results_list.append(results)

			if separate_testset:
				X, y, headers = in_out.import_data(f_test, record_id, target_id) # assumption: first column is patient number and is pruned, last is target
				if isinstance(X, bool): return

				print('  ...instances: {}, attributes: {} (test set)'.format(X.shape[0], X.shape[1]))

				results = predict_separate(X, y, fname, out_dir+'/'+alg+'_test/', record_id, target_id, feature_selection, model, best_features)
				results_list2.append(results)

		try:
			in_out.save_ROC(out_dir+'/'+alg+'/'+"roc.png", results_list, title='ROC curve')
		except IndexError:
			pass

		try:
			in_out.save_ROC(out_dir+'/'+alg+'_test/'+"roc.png", results_list2, title='ROC curve')
		except NameError:
			pass

	# notify user
	print('## Learning Finished ##')
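
The day-selection loop above grows new_X one row at a time with np.append, which is quadratic in the number of rows. A vectorized sketch of the same selection, assuming X and y are numeric NumPy arrays as imported above:

	import numpy as np

	mask = X[:, day_index] == day                   # rows recorded on the requested day
	new_X = np.delete(X[mask], day_index, axis=1)   # drop the day column itself
	new_y = np.asarray(y[mask], dtype=int)
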
Example #6
def execute(in_dir, out_dir, record_id, target_id, algorithms, feature_selection, separate_testset, in_dir_test, survival, oversampling, undersampling, aggregation):
	'''executes the learning task on the data in in_dir with the algorithms in algorithms.
		The results are written to out_dir and subdirectories,
	    and the record_ and target_ids are used to differentiate attributes and non-attributes'''
	print('### executing learning algorithms on... ###')
	
	# get the files
	files = util.list_dir_csv(in_dir)

	# stop if no files found
	if not files:
		print('No appropriate csv files found. Select an input directory with appropriate files')
		return

	if separate_testset:
		files_test = util.list_dir_csv(in_dir_test)
	else:
		files_test = files

	# create directory
	util.make_dir(out_dir)

	# execute each algorithm
	for alg in algorithms:
		print('...{}'.format(alg))
	
		util.make_dir(out_dir+'/'+alg+'/')
		results_list = []	
		if separate_testset:
			results_list2 = []
			util.make_dir(out_dir+'/'+alg+'_test/')

		# list which will contain the results
	
		# run algorithm alg for each file f
		for f, f_test in zip(files,files_test):
			fname = in_out.get_file_name(f, extension=False)
			print(' ...{}'.format(fname))
	
			# get data, split in features/target. If invalid stuff happened --> exit
			X, y, headers, target_list = in_out.import_data(f, record_id, target_id, survival) # assumption: first column is patientnumber and is pruned, last is target
			if isinstance(X, bool): return

			if aggregation:
				X, headers = aggregations(f, target_list, survival)

			print('  ...instances: {}, attributes: {}'.format(X.shape[0], X.shape[1]))

			model, best_features, results = execute_with_algorithm(alg, X, y, fname, headers, out_dir+'/'+alg+'/', record_id, target_id, feature_selection, oversampling, survival, undersampling, aggregation)
			results_list.append(results)

			if separate_testset:
				X, y, headers = in_out.import_data(f_test, record_id, target_id) # assumption: first column is patientnumber and is pruned, last is target
				if isinstance(X, bool): return

				print('  ...instances: {}, attributes: {} (test set)'.format(X.shape[0], X.shape[1]))

				results = predict_separate(X, y, fname, out_dir+'/'+alg+'_test/', record_id, target_id, feature_selection, model, best_features)
				results_list2.append(results)

		try:
			in_out.save_ROC(out_dir+'/'+alg+'/'+"roc.png", results_list, title='ROC curve')
		except IndexError:
			pass
		
		try:
			in_out.save_ROC(out_dir+'/'+alg+'_test/'+"roc.png", results_list2, title='ROC curve')
		except NameError:
			pass

	# notify user
	print('## Learning Finished ##')
Example #7
    def process_csv(self, needs_processing):
        '''converts the specified CSVs to usable data'''

        # get all csv's in the input folder
        files = util.list_dir_csv(self.in_dir)

        # put the IDs of the 'main' file in a dict
        ID_f = util.select_file(files, 'patient')
        rows, fields = util.import_data(ID_f, delim=self.delim)
        headers = self.get_IDs(rows, fields)

        # add CRC value to each patient
        CRC_f = util.select_file(files, 'journaal')
        rows, fields = util.import_data(CRC_f, delim=self.delim)
        self.get_CRC_occurrences(rows, fields)

        # randomize dates
        self.insert_data_intervals()

        # gather data from medication csv
        if 'medication' in needs_processing and needs_processing['medication']:
            print('...processing medication')
            med_f = util.select_file(files, 'medicatie')
            rows, fields = util.import_data(med_f, delim=self.delim)
            med_headers, self.num_med, self.num_med_pos = self.insert_data(
                rows,
                fields,
                'atc_code', ['voorschrijfdatum', 'voorschrijfdatum'],
                '[A-Z][0-9][0-9]',
                3,
                suffix='atc')
            headers = headers + med_headers

        # gather data from consult csv
        if 'consults' in needs_processing and needs_processing['consults']:
            print('...processing consults')
            consult_f = util.select_file(files, 'journaal')
            rows, fields = util.import_data(consult_f, delim=self.delim)
            consult_headers, self.num_cons, self.num_cons_pos = self.insert_data(
                rows,
                fields,
                'icpc', ['datum', 'datum'],
                '[A-Z][0-9][0-9]',
                3,
                incorporate_SOEP='soepcode')
            headers = headers + consult_headers

        # gather data from referral csv
        if 'referrals' in needs_processing and needs_processing['referrals']:
            print('...processing referrals')
            ref_f = util.select_file(files, 'verwijzing')
            rows, fields = util.import_data(ref_f, delim=self.delim)
            ref_headers, _, _ = self.insert_data(rows, fields, 'specialisme',
                                                 ['datum', 'datum'], '.*',
                                                 None)
            headers = headers + ref_headers

        # gather data from comorbidity csv
        if 'comorbidity' in needs_processing and needs_processing[
                'comorbidity']:
            print('...processing comorbidity')
            comor_f = util.select_file(files, 'comorbiditeit')
            rows, fields = util.import_data(comor_f, delim=self.delim)
            comor_headers, _, _ = self.insert_data(rows,
                                                   fields,
                                                   'omschrijving',
                                                   ['begindatum', 'einddatum'],
                                                   '.+',
                                                   None,
                                                   suffix='comorbiditeit')
            headers = headers + comor_headers

        # gather data from lab results csv
        if 'lab_results' in needs_processing and needs_processing[
                'lab_results']:
            print('...processing lab results')
            lab_f = util.select_file(files, 'bepaling')
            rows, fields = util.import_data(lab_f, delim=self.delim)
            lab_headers, self.num_lab, self.num_lab_pos = self.insert_data(
                rows,
                fields,
                'code', ['datum', 'datum'],
                '.+',
                None,
                suffix='lab_results')
            headers = headers + lab_headers

        # move CRC indicator to end of each instance data list
        self.move_target_to_end_of_list()

        # append target element to headers, add to class var
        headers.append('target')
        self.headers = headers
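
The insert_data calls in the example above pass the pattern '[A-Z][0-9][0-9]' with an abstraction level of 3, which reflects the structure of ATC and ICPC codes: the first three characters identify the main group. A small illustration (the level-3 truncation performed by insert_data is an assumption):

    import re

    code = 'N02BE01'   # a full ATC code (paracetamol)
    group = code[:3]   # 'N02' is what an abstraction level of 3 would keep
    assert re.match('[A-Z][0-9][0-9]', group)
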
Example #8
def execute(in_dir, out_dir, record_id, target_id, algorithms, feature_selection, separate_testset, in_dir_test):
	'''executes the learning task on the data in in_dir with the algorithms in algorithms.
		The results are written to out_dir and subdirectories,
	    and the record_ and target_ids are used to differentiate attributes and non-attributes'''
	print('### executing learning algorithms on... ###')
	
	# get the files
	files = util.list_dir_csv(in_dir)

	# stop if no files found
	if not files:
		print('No appropriate csv files found. Select an input directory with appropriate files')
		return

	if separate_testset:
		files_test = util.list_dir_csv(in_dir_test)
	else:
		files_test = files

	# create directory
	util.make_dir(out_dir)

	# execute each algorithm
	for alg in algorithms:
		print('...{}'.format(alg))
	
		util.make_dir(out_dir+'/'+alg+'/')
		results_list = []	
		if separate_testset:
			results_list2 = []
			util.make_dir(out_dir+'/'+alg+'_test/')

		# list which will contain the results
	
		# run algorithm alg for each file f
		for f, f_test in zip(files,files_test):
			fname = in_out.get_file_name(f, extension=False)
			print(' ...{}'.format(fname))
	
			# get data, split into features/target; bail out if the import failed
			X, y, headers = in_out.import_data(f, record_id, target_id) # assumption: first column is patient number and is pruned, last is target
			if isinstance(X, bool): return

			# if separate_testset:
			# 	X, X_te = X
			# 	y, y_te = y
			# 	print '  ...train instances: {}, attributes: {}'.format(X.shape[0], X.shape[1])
			# 	print '  ...test instances: {}, attributes: {}'.format(X_te.shape[0], X_te.shape[1])
			# else:
			print('  ...instances: {}, attributes: {}'.format(X.shape[0], X.shape[1]))

			model, best_features, results = execute_with_algorithm(alg, X, y, fname, headers, out_dir+'/'+alg+'/', record_id, target_id, feature_selection)
			results_list.append(results)

			if separate_testset:
				X, y, headers = in_out.import_data(f_test, record_id, target_id) # assumption: first column is patient number and is pruned, last is target
				if isinstance(X, bool): return

				print('  ...instances: {}, attributes: {} (test set)'.format(X.shape[0], X.shape[1]))

				results = predict_separate(X, y, fname, out_dir+'/'+alg+'_test/', record_id, target_id, feature_selection, model, best_features)
				results_list2.append(results)

		try:
			in_out.save_ROC(out_dir+'/'+alg+'/'+"roc.png", results_list, title='ROC curve')
		except IndexError:
			pass
		
		try:
			in_out.save_ROC(out_dir+'/'+alg+'_test/'+"roc.png", results_list2, title='ROC curve')
		except NameError:
			pass

	# notify user
	print('## Learning Finished ##')
Example #9
def execute_knn(in_dir, out_dir, record_id, target_id, day_id, day, k):
    '''executes the k-nearest-neighbour learning task on the data in in_dir.
    The results are written to out_dir and subdirectories,
    and the record_ and target_ids are used to differentiate attributes and non-attributes.'''
    print('### executing learning algorithms on... ###')

    # get the files
    files = util.list_dir_csv(in_dir)

    # stop if no files found
    if not files:
        print('No appropriate csv files found. Select an input directory with appropriate files')
        return

    # create directory
    util.make_dir(out_dir)

    # run kNN for each file f
    for f in files:
        results_list = []
        fname = in_out.get_file_name(f, extension=False)
        print(' ...{}'.format(fname))

        # get data, split into features/target; bail out if the import failed
        X, y, headers = in_out.import_data(
            f, record_id, target_id,
            True)  # assumption: first column is patient number
        if isinstance(X, bool): return

        day_index = headers.index(day_id)
        new_X = np.zeros((0, len(headers)))
        new_y = []

        IDs = []
        IDrows = {}

        # ordering of time points and complete data (filled with nan's if not available) assumed!

        # Select the right day and normalize the columns
        new_index = 0
        for i in range(0, X.shape[0]):
            if X[i, headers.index(day_id)] == day or day == -1:
                row = np.array(X[i, :]).reshape(-1)

                if not row[0] in IDs:
                    IDs.append(row[0])
                    new_y.append(int(y[i]))
                    IDrows[row[0]] = [new_index]
                else:
                    IDrows[row[0]].append(new_index)
                new_X = np.append(new_X, np.column_stack(row), axis=0)
                new_index += 1

        # Remove the id, the day, and the time stamp from the data and headers.
        new_X = np.delete(new_X, 2, 1)
        new_X = np.delete(new_X, 1, 1)
        new_X = np.delete(new_X, 0, 1)
        new_headers = headers[3:len(headers)]
        X = new_X

        # Remove columns with only a single value or all nans

        non_singular_rows = [
            i for i in range(0, X.shape[1])
            if len(set(util.get_non_nans(X[:, i].tolist()))) > 1
        ]

        X = X[:, non_singular_rows]
        new_headers = np.array(new_headers)[non_singular_rows].tolist()

        max_values = np.nanmax(X, axis=0)
        min_values = np.nanmin(X, axis=0)

        # guard against zero-range columns to avoid division by zero
        ranges = []
        for i in range(0, len(min_values)):
            diff = max_values[i] - min_values[i]
            if diff == 0:
                print('difference of zero encountered in column ' + str(i))
                print('Max value: ' + str(max_values[i]))
                print('Min value: ' + str(min_values[i]))
                ranges.append(1)
            else:
                ranges.append(diff)

        # min-max scale the columns to the same order of magnitude
        scaled_X = (X - min_values) / np.array(ranges)
        X = scaled_X
        y = np.squeeze(np.asarray(new_y))

        for ID in IDs:
            IDrows[ID] = {
                'first_row': min(IDrows[ID]),
                'last_row': max(IDrows[ID])
            }

        print('  ...instances: {}, attributes: {}'.format(
            X.shape[0], X.shape[1]))

        # Now we are going to build the similarity matrix. We also store how many
        # attributes we were actually able to make a comparison for.

        similarity_matrix = np.zeros((len(IDs), len(IDs)))
        matching_number_matrix = np.ones((len(IDs), len(IDs)))

        for i in range(0, len(IDs)):
            for j in range(i + 1, len(IDs)):
                for attr in range(0, len(new_headers)):
                    i_data = X[IDrows[IDs[i]]['first_row']:
                               IDrows[IDs[i]]['last_row'] + 1, attr].tolist()
                    j_data = X[IDrows[IDs[j]]['first_row']:
                               IDrows[IDs[j]]['last_row'] + 1, attr].tolist()
                    if new_headers[attr] in dtw_attr:
                        dtw_distance = dtw.lb_keogh(i_data, j_data, window)
                        if not dtw_distance == -1:
                            similarity_matrix[i, j] += dtw_distance
                            matching_number_matrix[i, j] += 1
                    else:
                        i_data = util.get_non_nans(i_data)
                        j_data = util.get_non_nans(j_data)
                        if len(i_data) > 0 and len(j_data) > 0:
                            simple_distance = math.pow(
                                np.mean(i_data) - np.mean(j_data), 2)
                            similarity_matrix[i, j] += simple_distance
                            matching_number_matrix[i, j] += 1
                similarity_matrix[j, i] = similarity_matrix[i, j]
                matching_number_matrix[j, i] = matching_number_matrix[i, j]

        similarity_matrix = similarity_matrix / matching_number_matrix  # We calculate the average score per item matched
        # Best might be to apply a weighting scheme now.

        results = perform_classification(similarity_matrix, y, out_dir, k)
        results_list.append(results)

        in_out.save_results(out_dir + str(k) + '.csv',
                            ["fpr", "tpr", "auc", "cm"],
                            results[1:len(results)], [sum(y), len(y)])
        in_out.save_ROC(out_dir + '/roc.png', results_list, title='ROC curve')

    # notify user
    print('## Learning Finished ##')
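
Both kNN examples call dtw.lb_keogh(i_data, j_data, window) without defining the dtw module or the window size (and Example #9 also uses dtw_attr, defined only in Example #10); these are assumed to exist at module level. A minimal sketch of the LB_Keogh lower bound consistent with those call sites, where -1 signals that no comparison was possible; the NaN handling is an assumption:

    import numpy as np

    def lb_keogh(s1, s2, window):
        '''LB_Keogh lower bound: compare each point of s1 against the
        upper/lower envelope of s2 within +/- window time steps'''
        total = 0.0
        compared = False
        for i, value in enumerate(s1):
            if np.isnan(value):
                continue
            segment = [v for v in s2[max(0, i - window):i + window + 1]
                       if not np.isnan(v)]
            if not segment:
                continue
            compared = True
            upper, lower = max(segment), min(segment)
            if value > upper:
                total += (value - upper) ** 2
            elif value < lower:
                total += (value - lower) ** 2
        return total if compared else -1  # -1: nothing comparable, as checked above
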
Example #10
def execute_knn(in_dir, out_dir, record_id, target_id, day_id, day, k):
    '''executes the k-nearest-neighbour learning task on the data in in_dir.
    The results are written to out_dir and subdirectories,
    and the record_ and target_ids are used to differentiate attributes and non-attributes.'''
    print('### executing learning algorithms on... ###')

    # get the files
    files = util.list_dir_csv(in_dir)

    # stop if no files found
    if not files:
        print('No appropriate csv files found. Select an input directory with appropriate files')
        return

    # create directory
    util.make_dir(out_dir)

    # run kNN for each file f
    for f in files:
        results_list = []
        fname = in_out.get_file_name(f, extension=False)
        print(' ...{}'.format(fname))

        # get data, split into features/target; bail out if the import failed
        X, y, headers = in_out.import_data(
            f, record_id, target_id,
            True)  # assumption: first column is patient number
        if isinstance(X, bool): return

        day_index = headers.index(day_id)
        new_X = np.zeros((0, len(headers)))
        new_y = []

        IDs = []
        IDrows = {}

        # ordering of time points and complete data (filled with nan's if not available) assumed!

        #  		features_to_be_removed   =    [ "pvc_bin","pnc_bin","pac_bin","ect_freq_bin","full_code_bin","comfort_meas_bin","other_code_bin","no_cpr_bin",
        # 										"dnr_bin","dni_bin","fall_risk_bin","orientation_ord","orient_unable_ass_bin","riker_sas_ord","vent_bin",
        # 										"vent_mode_ord","pacemaker_bin","trach_bin","flush_skin_bin","jaundice_skin_bin","pale_skin_bin","impaired_skin_bin",
        # 										"iabp_ord","iabp_bin","svnsicu_bin","svcsicu_bin","svcsru_bin","svmicu_bin","svmsicu_bin","svother_bin","svccu_bin",
        # 										"gender"]

        exclude = [
            146, 140, 95, 123, 88, 133, 22, 65, 49, 114, 178, 55, 133, 138, 34,
            186, 20, 73
        ]
        new_index = 0
        for i in range(0, X.shape[0]):
            if X[i, headers.index(day_id)] == day or day == -1:
                row = np.array(X[i, :]).reshape(-1)

                if not row[0] in IDs and not row[0] in exclude:
                    IDs.append(row[0])
                    new_y.append(int(y[i]))
                    IDrows[row[0]] = [new_index]
                elif not row[0] in exclude:
                    IDrows[row[0]].append(new_index)
                new_X = np.append(new_X, np.column_stack(row), axis=0)
                new_index += 1

        ID_column = new_X[:, 0]

        # Remove the id, the day, and the time stamp from the data and headers.
        new_X = np.delete(new_X, 2, 1)
        new_X = np.delete(new_X, 1, 1)
        new_X = np.delete(new_X, 0, 1)
        new_headers = headers[3:len(headers)]

        dtw_attr = ['hr', 'resp', 'nbp', 'sbp', 'dbp', 'so2']

        X = new_X
        print(len(X))

        non_singular_rows = [
            i for i in range(0, X.shape[1])
            if len(set(util.get_non_nans(X[:, i].tolist()))) > 1
        ]

        X = X[:, non_singular_rows]
        new_headers = np.array(new_headers)[non_singular_rows].tolist()
        print(str(len(new_headers)) + ' headers remain after removing singular columns')
        print(new_headers)
        print('Removed columns with only NaNs or a single value')
        max_values = np.nanmax(X, axis=0)
        min_values = np.nanmin(X, axis=0)

        # guard against zero-range columns to avoid division by zero
        ranges = []
        for i in range(0, len(min_values)):
            diff = max_values[i] - min_values[i]
            if diff == 0:
                print('difference of zero encountered in column ' + str(i))
                print('Max value: ' + str(max_values[i]))
                print('Min value: ' + str(min_values[i]))
                ranges.append(1)
            else:
                ranges.append(diff)

        # min-max scale the columns to the same order of magnitude
        scaled_X = (X - min_values) / np.array(ranges)
        X = scaled_X
        y = np.squeeze(np.asarray(new_y))

        print "Scaling done!"

        for ID in IDs:
            IDrows[ID] = {
                'first_row': min(IDrows[ID]),
                'last_row': max(IDrows[ID])
            }

        print('  ...instances: {}, attributes: {}'.format(
            X.shape[0], X.shape[1]))

        # Now we are going to build the similarity matrix. We also store how many
        # attributes we were actually able to make a comparison for.

        similarity_matrix = np.ones((len(IDs), len(IDs)))
        matching_number_matrix = np.ones((len(IDs), len(IDs)))

        for attr in range(0, len(new_headers)):
            print('attribute {}/{} in kNN loop'.format(attr, len(new_headers)))

            temp = np.ones((len(IDs), len(IDs)))
            temp[:] = 2
            for i in range(0, len(IDs)):
                for j in range(i + 1, len(IDs)):

                    i_data = X[IDrows[IDs[i]]['first_row']:
                               IDrows[IDs[i]]['last_row'] + 1, attr].tolist()
                    j_data = X[IDrows[IDs[j]]['first_row']:
                               IDrows[IDs[j]]['last_row'] + 1, attr].tolist()

                    if new_headers[attr] in dtw_attr:
                        dtw_distance = dtw.lb_keogh(i_data, j_data, window)

                        if not dtw_distance == -1:
                            temp[i, j] += dtw_distance
                            matching_number_matrix[i, j] += 1
                            matching_number_matrix[
                                j, i] = matching_number_matrix[i, j]
                            temp[j, i] = temp[i, j]
                    else:
                        i_data = util.get_non_nans(i_data)
                        j_data = util.get_non_nans(j_data)
                        if len(i_data) > 0 and len(j_data) > 0:
                            simple_distance = math.pow(
                                np.mean(i_data) - np.mean(j_data), 2)
                            temp[i, j] += simple_distance
                            matching_number_matrix[i, j] += 1
                            matching_number_matrix[
                                j, i] = matching_number_matrix[i, j]
                            temp[j, i] = temp[i, j]

            if np.max(temp) != 0:
                temp = temp / np.max(temp)
            similarity_matrix += temp

        # We calculate the average score per item matched
        # Best might be to apply a weighting scheme now.
        similarity_matrix = (similarity_matrix / matching_number_matrix) + (
            1 / matching_number_matrix)

        print(len(IDs))
        results = perform_classification(similarity_matrix, y, out_dir, k)
        results_list.append(results)
        print(results)
        in_out.save_results(out_dir + str(k) + '.csv',
                            ["fpr", "tpr", "auc", "cm"],
                            results[1:len(results)], [sum(y), len(y)])
        in_out.save_ROC(out_dir + '/roc.png', results_list, title='ROC curve')

    # notify user
    print('## Learning Finished ##')
    print(similarity_matrix)