def feature_selection_before(features, targets, dataset, percentage, ids, one_fold_measures, standardize=False, prt=False, file_name=None):
    """Cross-validate the known samples on a pre-selected subset of features.

    The data is split with ``split_dataset``, reduced to the features returned
    by ``select_features(percentage)``, optionally standardized, and handed to
    ``cv10``. The selected features are printed once ``cv10`` returns.

    Parameters:
        features: passed to ``SelectedFeatures`` next to the selection
            (presumably the full feature list of ``dataset`` -- confirm).
        targets, dataset: inputs of ``split_dataset``.
        percentage: forwarded to ``select_features``.
        ids, one_fold_measures, prt, file_name: forwarded unchanged to ``cv10``.
        standardize: when true, the reduced data and the targets go through
            ``StandardizedData.split_and_standardize_dataset`` first.
    """
    known_dataset, known_targets, _unknown = split_dataset(dataset, targets)
    known_targets = np.asarray(known_targets)

    # The selection used to be computed here through feature_selection_cv
    # (features_cross_validation followed by select_final_features_from_cv).
    # Those results were saved to decrease computation time, so they are only
    # looked up now.
    selected_features = select_features(percentage)

    known_dataset = SelectedFeatures(
        known_dataset, known_targets, selected_features, features
    ).extract_data_from_selected_features()

    if standardize:
        known_dataset, known_targets = StandardizedData(
            known_targets, known_dataset
        ).split_and_standardize_dataset()

    cv10(np.array(known_dataset), known_targets, ids, one_fold_measures, prt, file_name)

    summary = '####### FEATURES ####### %d \n %s' % (len(selected_features), str(selected_features))
    print(summary)
def feature_selection(features, targets, dataset, ids, target, one_fold_measures, standardize=False):
    """Cross-validate the known samples on the features picked by ``get_best``.

    The number of features requested from ``get_best`` is
    ``floor(TOP_FEATURES_PERCENTAGE_THRESHOLD * len(features))``. The module
    level ``civil_*`` inputs are used when ``target == 'civil'``; any other
    value of ``target`` uses the ``highval_*`` inputs.

    Parameters:
        features: its length sizes the selection; it is also passed to
            ``SelectedFeatures``.
        targets, dataset: inputs of ``split_dataset``.
        ids, one_fold_measures: forwarded unchanged to ``cv10``.
        target: ``'civil'`` selects the civil inputs, anything else the
            highval ones.
        standardize: when true, the reduced data and the targets go through
            ``StandardizedData.split_and_standardize_dataset`` first.
    """
    known_dataset, known_targets, _unknown = split_dataset(dataset, targets)
    known_targets = np.asarray(known_targets)

    nr_times = int(math.floor(TOP_FEATURES_PERCENTAGE_THRESHOLD * len(features)))

    # Only the tuple of the chosen branch is evaluated, exactly like an if/else.
    source_all, source_x, source_y = (
        (civil_all, civil_all_x, civil_all_y)
        if target == 'civil'
        else (highval_all, highval_all_x, highval_all_y)
    )
    ssa_features = get_best(source_all, source_x, source_y, nr_times)

    ssa_dataset = SelectedFeatures(
        known_dataset, known_targets, ssa_features, features
    ).extract_data_from_selected_features()

    if standardize:
        ssa_dataset, known_targets = StandardizedData(
            known_targets, ssa_dataset
        ).split_and_standardize_dataset()

    cv10(ssa_dataset, known_targets, ids, one_fold_measures)

    summary = '####### FEATURES ####### %d \n %s' % (len(ssa_features), str(ssa_features))
    print(summary)
"""
Single KNN
"""
# NOTE(review): the opening of this docstring was not visible in the reviewed
# text; only its last line is reproduced here. Confirm against the original.
print(__doc__)

import sys
sys.path.insert(0, 'utils/')
from load_data import *
from project_data import *
from parse_theme import *
from standardized_data import *
from cv import cv10
from cv import knn_one_fold_measures
import numpy as np

if __name__ == "__main__":
    # Load the project spreadsheet and pull out the targets and sample ids.
    spreadsheet = Spreadsheet(project_data_file)
    data = Data(spreadsheet)
    targets = data.targets
    ids = data.ids

    # Only the argument lookup is guarded: an IndexError raised deeper in the
    # pipeline must surface as a traceback instead of being reported as a
    # missing command line argument.
    try:
        theme = sys.argv[1]
    except IndexError:
        print("Error!! Pass 'all' as argument")
    else:
        [dataset, features] = parse_theme(theme)

        # Standardize, then cross-validate with the KNN measures.
        std = StandardizedData(targets, dataset)
        known_dataset_scaled, known_targets = std.split_and_standardize_dataset()
        cv10(known_dataset_scaled, known_targets, ids, knn_one_fold_measures)
"""
Logistic Regression Classification Single LR
"""
# NOTE(review): the opening of this docstring and its original line breaks
# were not visible in the reviewed text. Confirm against the original.
print(__doc__)

import sys
sys.path.insert(0, 'utils/')
from load_data import *
from project_data import *
from parse_theme import *
from split_dataset import *
from cv import cv10
from cv import lr_one_fold_measures
import numpy as np

if __name__ == "__main__":
    # Load the project spreadsheet and pull out the targets and sample ids.
    spreadsheet = Spreadsheet(project_data_file)
    data = Data(spreadsheet)
    targets = data.targets
    ids = data.ids

    # Only the argument lookup is guarded: an IndexError raised deeper in the
    # pipeline must surface as a traceback instead of being reported as a
    # missing command line argument.
    try:
        theme = sys.argv[1]
    except IndexError:
        print("Error!! Pass 'all' as argument")
    else:
        [dataset, features] = parse_theme(theme)

        # No standardization here: split, then cross-validate with the
        # logistic regression measures.
        [known_dataset, known_targets, unk] = split_dataset(dataset, targets)
        cv10(np.array(known_dataset), np.array(known_targets), ids, lr_one_fold_measures)
print(__doc__)

import sys
sys.path.insert(0, 'utils/')
from load_data import *
from project_data import *
from parse_theme import *
from standardized_data import *
from cv import cv10
from cv import single_svm_one_fold_measures

if __name__ == "__main__":
    # Load the project spreadsheet and pull out the targets and sample ids.
    spreadsheet = Spreadsheet(project_data_file)
    data = Data(spreadsheet)
    targets = data.targets
    ids = data.ids

    # Only the argument lookup is guarded: an IndexError raised deeper in the
    # pipeline must surface as a traceback instead of being reported as a
    # missing command line argument.
    try:
        theme = sys.argv[1]
    except IndexError:
        print("Error!! Pass 'all' as argument")
    else:
        [dataset, features] = parse_theme(theme)

        # Standardize, then cross-validate with the single SVM measures.
        std = StandardizedData(targets, dataset)
        known_dataset_scaled, known_targets = std.split_and_standardize_dataset()
        cv10(known_dataset_scaled, known_targets, ids, single_svm_one_fold_measures)