def feature_selection(features, targets, dataset, target, dt, knn, svm):
	"""Run the ensemble cross-validation on a pre-ranked subset of features.

	Steps, in order:
	  1. split_dataset() separates `dataset`/`targets`; only the first two
	     returned parts (the "known" data and targets) are used here.
	  2. The number of features to keep is
	     floor(TOP_FEATURES_PERCENTAGE_THRESHOLD * len(features)).
	  3. get_best() picks that many features from the module-level
	     `civil_*` tables when target == 'civil', otherwise from the
	     `highval_*` tables.
	  4. The known data is reduced to those features and a standardized
	     copy is produced.
	  5. cv10_ensemble() is called 100 times, then the selected features
	     are printed.

	Parameters:
	  features -- sequence of feature names; its length sets how many
	              features are kept, and it is handed to SelectedFeatures.
	  targets  -- target values, passed to split_dataset().
	  dataset  -- data, passed to split_dataset().
	  target   -- target name; 'civil' selects the civil tables, any other
	              value selects the highval tables. Also used to build the
	              output file name.
	  dt, knn, svm -- forwarded untouched to cv10_ensemble()
	              (presumably the three classifiers -- confirm against
	              cv10_ensemble).

	Returns None. Raises AssertionError if the original and the
	standardized targets have no value in common.
	"""
	# The third element returned by split_dataset() is not needed here.
	known_dataset, known_targets, _ = split_dataset(dataset, targets)
	known_targets = np.asarray(known_targets)

	nr_times = int(math.floor(TOP_FEATURES_PERCENTAGE_THRESHOLD * len(features)))

	if target == 'civil':
		ssa_features = get_best(civil_all, civil_all_x, civil_all_y, nr_times)
	else:
		ssa_features = get_best(highval_all, highval_all_x, highval_all_y, nr_times)

	sf = SelectedFeatures(known_dataset, known_targets, ssa_features, features)
	ssa_dataset = sf.extract_data_from_selected_features()

	std = StandardizedData(known_targets, ssa_dataset)
	ssa_dataset_scaled, known_targets_scaled = std.split_and_standardize_dataset()

	# Sanity check only: the two target collections must share at least one
	# value. NOTE(review): known_targets_scaled is not used past this point;
	# cv10_ensemble() receives the original known_targets together with the
	# scaled dataset -- confirm that the row order is the same in both.
	assert not set(known_targets).isdisjoint(known_targets_scaled)

	file_name = "ensemble_single_" + target + ".txt"
	# Repeat the evaluation 100 times; the iteration index itself is unused.
	for _ in range(100):
		cv10_ensemble(ssa_dataset, known_targets, ssa_dataset_scaled, dt, knn, svm, prt=True, file_name=file_name)

	# Single-argument parenthesized print: same output under Python 2,
	# and still valid syntax under Python 3.
	print('####### FEATURES ####### %d \n %s' % (len(ssa_features), str(ssa_features)))
예제 #2
0
def feature_selection(features, targets, dataset, ids, target, one_fold_measures, standardize=False):
	"""Run cv10 on a pre-ranked subset of features.

	Steps, in order:
	  1. split_dataset() separates `dataset`/`targets`; only the first two
	     returned parts (the "known" data and targets) are used here.
	  2. The number of features to keep is
	     floor(TOP_FEATURES_PERCENTAGE_THRESHOLD * len(features)).
	  3. get_best() picks that many features from the module-level
	     `civil_*` tables when target == 'civil', otherwise from the
	     `highval_*` tables.
	  4. The known data is reduced to those features; when `standardize`
	     is true, both the data and the targets are replaced by the output
	     of StandardizedData.split_and_standardize_dataset().
	  5. cv10() is called once, then the selected features are printed.

	Parameters:
	  features -- sequence of feature names; its length sets how many
	              features are kept, and it is handed to SelectedFeatures.
	  targets  -- target values, passed to split_dataset().
	  dataset  -- data, passed to split_dataset().
	  ids      -- forwarded untouched to cv10().
	  target   -- target name; 'civil' selects the civil tables, any other
	              value selects the highval tables.
	  one_fold_measures -- forwarded untouched to cv10().
	  standardize -- when true, standardize the selected data before cv10().

	Returns None.
	"""
	# The third element returned by split_dataset() is not needed here.
	known_dataset, known_targets, _ = split_dataset(dataset, targets)
	known_targets = np.asarray(known_targets)

	nr_times = int(math.floor(TOP_FEATURES_PERCENTAGE_THRESHOLD * len(features)))

	if target == 'civil':
		ssa_features = get_best(civil_all, civil_all_x, civil_all_y, nr_times)
	else:
		ssa_features = get_best(highval_all, highval_all_x, highval_all_y, nr_times)

	sf = SelectedFeatures(known_dataset, known_targets, ssa_features, features)
	ssa_dataset = sf.extract_data_from_selected_features()

	if standardize:
		# Both the data and the targets are rebound to the standardized output,
		# so cv10() always receives a matching pair.
		std = StandardizedData(known_targets, ssa_dataset)
		ssa_dataset, known_targets = std.split_and_standardize_dataset()

	cv10(ssa_dataset, known_targets, ids, one_fold_measures)

	# Single-argument parenthesized print: same output under Python 2,
	# and still valid syntax under Python 3.
	print('####### FEATURES ####### %d \n %s' % (len(ssa_features), str(ssa_features)))