def get_improve_prob(features, n_tests, dataset, folds, n_proc, p_features,
                     shared_lock, com, cost, gamma, svm_kernel):
    test_sets = utils.generate_random_sets(features, n_tests, 3, 10)
    accs = []
    for test in test_sets:  ## score each random subset against the classifier
        acc = classification_part.classify(folds, dataset, test, cost, gamma, svm_kernel)
        accs.append(acc)
    ## split the tests among the processes
    workers = []
    for i in range(1, n_proc):
        p = mp.Process(target=calculate_improve_prob,
                       args=(i, p_features[i], dataset, test_sets, accs, folds,
                             shared_lock, com, cost, gamma, svm_kernel))
        workers.append(p)
        p.start()
    ## the parent process works through its own share of the features directly
    calculate_improve_prob(0, p_features[0], dataset, test_sets, accs, folds,
                           shared_lock, com, cost, gamma, svm_kernel)
    for w in workers:
        w.join()
    return
def calculate_improve_prob(id, my_features, dataset, test_sets, accuracies,
                           folds, lock, com, cost, gamma, svm_kernel):
    ## my_features is a list of subset objects; it is easier to work with a flat
    ## list of their feature lists
    fts = []
    for i in my_features:
        fts.append(i.features)
    probs = []
    for ft in fts:  ## test every feature against every random subset
        improvements = 0
        for i in range(0, len(test_sets)):
            new_set = list(test_sets[i])
            if ft in new_set:  ## ft already in the set: remove it and check if accuracy drops
                new_set.remove(ft)
                added = False
            else:  ## ft not in the set: add it and check if accuracy improves
                new_set = new_set + ft
                added = True
            acc = classification_part.classify(folds, dataset, new_set, cost, gamma, svm_kernel)
            if added:
                if acc > accuracies[i]:
                    improvements += 1
            else:
                if acc < accuracies[i]:
                    improvements += 1
        prob = round(improvements / float(len(test_sets)), 2)
        probs.append((ft[0], prob))
    ## send the results through the pipe
    lock.acquire()
    output, input = com
    if output.poll():
        msg = output.recv()  ## take what is already there and append this process' results
        probs = msg + probs
    input.send(probs)
    lock.release()
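## A minimal, hypothetical sketch (not part of the pipeline) of the pipe-accumulation
## pattern used in calculate_improve_prob: each worker takes the lock, drains any
## partial results already sitting in the pipe, appends its own, and sends the merged
## list back; the parent drains the pipe once after all workers have joined.
## worker_sketch, demo_pipe_accumulation and the toy (id, value) payloads are
## illustrative stand-ins; this relies on the module's `import multiprocessing as mp`.
def worker_sketch(idx, lock, com):
    my_results = [(idx, round(idx / 10.0, 2))]  ## stand-in for this worker's probs
    lock.acquire()
    output_end, input_end = com
    if output_end.poll():  ## merge with results left by earlier workers
        my_results = output_end.recv() + my_results
    input_end.send(my_results)
    lock.release()

def demo_pipe_accumulation(n_proc=4):
    lock = mp.Lock()
    com = mp.Pipe()  ## (receiving end, sending end), shared by all workers
    workers = [mp.Process(target=worker_sketch, args=(i, lock, com))
               for i in range(n_proc)]
    for p in workers:
        p.start()
    for p in workers:
        p.join()
    return com[0].recv()  ## fully merged list, e.g. [(0, 0.0), (1, 0.1), ...]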
def nips_validation(data, best_subsets, mi_scores, params_norm, used_bins,
                    cost, gamma, settings):
    dataset_name = settings.dataset_name
    ## read the 3 sets of data: train, validation and test
    if settings.dataset_type == "dense":  ## dense type from NIPS
        data_train = prepare_data.import_nips_dense(settings.file_train[0], settings.file_train[1])
        data_valid = prepare_data.import_nips_dense(settings.file_valid[0], settings.file_valid[1])
        data_test = prepare_data.import_nips_dense(settings.file_test, "")
    elif settings.dataset_type == "sparse_binary":  ## sparse_binary type from NIPS
        data_train = prepare_data.import_nips_sparse_binary(settings.file_train[0], settings.file_train[1], settings.number_features)
        data_valid = prepare_data.import_nips_sparse_binary(settings.file_valid[0], settings.file_valid[1], settings.number_features)
        data_test = prepare_data.import_nips_sparse_binary(settings.file_test[0], "", settings.number_features)
    elif settings.dataset_type == "sparse_integer":  ## sparse_integer type from NIPS
        data_train = prepare_data.import_nips_sparse_integer(settings.file_train[0], settings.file_train[1], settings.number_features)
        data_valid = prepare_data.import_nips_sparse_integer(settings.file_valid[0], settings.file_valid[1], settings.number_features)
        data_test = prepare_data.import_nips_sparse_integer(settings.file_test[0], "", settings.number_features)
    ## normalize the 3 sets with the normalization parameters used during the
    ## feature selection process
    data_train = prepare_data.apply_normalization(data_train, params_norm)
    data_valid = prepare_data.apply_normalization(data_valid, params_norm)
    data_test = prepare_data.apply_normalization(data_test, params_norm)
    validation_results = {}  ## save the results of the validation
    ## build merged datasets and single folds to score the train and valid sets
    aux_data_1 = data + data_train
    folds_1 = [(range(0, len(data)), range(len(data), len(data) + len(data_train)))]
    aux_data_2 = data + data_valid
    folds_2 = [(range(0, len(data)), range(len(data), len(data) + len(data_valid)))]
    for i in range(0, len(best_subsets)):  ## test every subset and check which generalizes best
        acc_train = classification_part.classify(folds_1, aux_data_1, best_subsets[i][0],
                                                 cost, gamma, settings.svm_kernel)
        acc_valid = classification_part.classify(folds_2, aux_data_2, best_subsets[i][0],
                                                 cost, gamma, settings.svm_kernel)
        validation_results[i] = (acc_train, acc_valid)
    ## select the subset that obtained the best combined score on both sets...
    ## this could be changed
    top_score_1 = 0.0
    top_score_2 = 0.0
    top_subset = ""
    top_score = 0.0
    for i in validation_results:
        print best_subsets[i][0], validation_results[i]
        score_1 = validation_results[i][0]
        score_2 = validation_results[i][1]
        if score_1 + score_2 > top_score:
            top_score = score_1 + score_2
            top_score_1 = score_1
            top_score_2 = score_2
            top_subset = best_subsets[i][0]
        elif score_1 + score_2 == top_score:
            ## tie on the combined score: prefer the subset with the smaller gap
            ## between the two accuracies
            if abs(score_1 - score_2) < abs(top_score_1 - top_score_2):
                top_score = score_1 + score_2
                top_score_1 = score_1
                top_score_2 = score_2
                top_subset = best_subsets[i][0]
    print top_score_1, top_score_2, "selected subset:", top_subset
    ## create the NIPS result file for each set
    classify_data(data, top_subset, dataset_name + "_train", data_train, cost, gamma, settings.svm_kernel)
    classify_data(data, top_subset, dataset_name + "_valid", data_valid, cost, gamma, settings.svm_kernel)
    classify_data(data, top_subset, dataset_name + "_test", data_test, cost, gamma, settings.svm_kernel)
    ## write the selected features to the file, using the MI score as sort criterion
    top_subset = order_importance_of_features(top_subset, mi_scores)
    f_fts = open("results/" + dataset_name + ".feat", "a")
    for ft in top_subset:
        f_fts.write(str(int(ft) + 1) + "\n")
    f_fts.close()
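## The single-fold evaluation trick above, in isolation: append an external set to
## the selection data and build one (train_indices, test_indices) fold, so the usual
## cross-validation classifier can score generalization to the external set.
## make_holdout_fold is a hypothetical helper written only for illustration.
def make_holdout_fold(train_rows, extra_rows):
    merged = train_rows + extra_rows
    fold = (range(0, len(train_rows)),            ## train on the original data
            range(len(train_rows), len(merged)))  ## test on the appended rows
    return merged, [fold]

## usage, mirroring the calls above:
##   merged, folds = make_holdout_fold(data, data_valid)
##   acc = classification_part.classify(folds, merged, subset, cost, gamma, settings.svm_kernel)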
def worker_classification(id, work_list, comb_memory, data, features, folds,
                          settings, pipes, lock, global_info, probs, cost,
                          gamma, score_lock, work_lock):
    n_proc = settings.number_proc
    t_work_ask = 0  ## how many times this process requested work
    worker_start = time.time()
    mec_cut_nodes = 0  ## number of nodes removed by the second phase mechanism
    best_sets = {}
    update_rate = 10
    count = update_rate
    number_of_tests = 0
    tested_sets = {}  ## to save this process' own tests
    rts = []
    depth = False
    last_update = 0.0  ## last time the best global accuracy was updated
    wasted_time = 0.0  ## debug: time spent exchanging work
    send_time = 0.0
    times_work_not_sent = 0
    while True:
        rt = time.time()  ## measure how long it takes to process a subset
        if work_list == []:  ## process has no work: ask for some
            t_work_ask += 1
            ask_time = time.time()
            ## don't let processes ask for work right after start; make them wait a
            ## few seconds so the other processes have time to generate enough work
            ## to share. the last process runs out of work quickly because all its
            ## expansions are repeated, so it has to wait before testing
            if number_of_tests < 25:
                time.sleep(4)
            work_list = ask_for_work(id, global_info, lock, pipes, n_proc, work_lock)
            wasted_time += round(time.time() - ask_time, 2)
            if work_list == []:  ## received no work: stop
                break
        ## take a subset to test and remove it from the work list; popping from the
        ## end of the list is much faster than removing from the beginning
        test_subset = work_list.pop()
        if depth:
            work_list, last_update, aux_cut = check_cutting(id, last_update, global_info,
                                                            work_list, settings.search_cutting_time)
            mec_cut_nodes += aux_cut
        else:  ## switch to depth-first search and activate the sampling
            if len(test_subset.features) > settings.change_search_size:  ## switch search stages
                print "AT_SWITCH:" + str(number_of_tests) + "," + str(len(work_list))
                last_update = time.time()  ## start measuring the updates
                depth = True
        ## classify the subset
        score = classification_part.classify(folds, data, test_subset.features,
                                             cost, gamma, settings.svm_kernel)
        test_subset.parents_scores.append(score)
        number_of_tests += 1
        if checkExpand(test_subset, global_info, depth, settings):  ## if it's worth expanding the node
            work_list = expand_node(id, work_list, comb_memory, test_subset, features,
                                    n_proc, tested_sets, depth, probs, settings.estimate_probs)
        last_update = update_score(global_info, score, test_subset, best_sets,
                                   score_lock, last_update)  ## update the top scores
        rts.append(time.time() - rt)
        count -= 1
        if global_info.chosen_to_send == id:  ## this process was chosen to send work to someone
            stime = time.time()
            work_list, aux = send_work(id, work_list, global_info, pipes, lock, work_lock)
            times_work_not_sent += aux
            send_time += round(time.time() - stime, 2)
            count = update_rate  ## to update the work list
        if count < 0:
            ## periodically give feedback to the user and globally update the
            ## amount of work held by this process
            count = update_rate  ## number of tests until the next output
            update_my_work(id, work_list, global_info, work_lock)
            ## debug info: average processing time per subset since the last report
            avg = round(sum(rts) / float(len(rts)), 4)
            print id, ",", avg, ",", max(best_sets), ",", int(time.time() - worker_start), \
                ",", len(work_list), ",", str(best_sets[max(best_sets)].features)
            rts = []
    total_working_time = time.time() - worker_start
    ## write this process' summary line and its best subsets to the shared results file
    lock.acquire()
    out_file = open("res.csv", "a")
    out_file.write("PROCESS" + str(id) + "," + str(total_working_time) + "," +
                   str(number_of_tests) + "," + str(max(best_sets)) + "," +
                   str(mec_cut_nodes) + "," + str(wasted_time) + "," +
                   str(send_time) + "," + str(t_work_ask) + "," +
                   str(times_work_not_sent) + "\n")
    for score in best_sets:
        set_info = ""
        for ft in best_sets[score].features:
            set_info += str(ft) + ","
        set_info += str(score) + "\n"
        out_file.write(set_info)
    out_file.close()
    lock.release()
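## Sketch of the best_sets bookkeeping used above: a dict keyed by accuracy, so
## max(best_sets) yields the top score and best_sets[max(best_sets)] its subset.
## _SubsetStub and demo_best_sets are hypothetical stand-ins for the project's
## subset objects, written only for illustration.
class _SubsetStub(object):
    def __init__(self, features):
        self.features = features

def demo_best_sets():
    best_sets = {}
    for features, score in [(["3", "7"], 0.81), (["3", "7", "9"], 0.86)]:
        ## a later subset with the same score overwrites the earlier one
        best_sets[score] = _SubsetStub(features)
    top = max(best_sets)
    print top, best_sets[top].features  ## 0.86 ['3', '7', '9']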