def classify_final_test(data, features, data_to_classify, cost, g, svm_kernel):
    """Train an SVM on all of ``data`` and predict labels for ``data_to_classify``.

    Every row of ``data`` goes into the training set and every row of
    ``data_to_classify`` into the test set; both are restricted to
    ``features`` by ``utils.cut_dataset``.

    Args:
        data: labelled dataset used entirely for training.
        features: feature selection handed to ``utils.cut_dataset``.
        data_to_classify: dataset whose rows are to be predicted.
        cost: value of the SVM ``C`` parameter.
        g: ``gamma`` of the RBF kernel; not used when ``svm_kernel`` is
            ``"linear"``.
        svm_kernel: ``"linear"`` selects ``svm.LinearSVC``; any other value
            selects ``svm.SVC`` with an RBF kernel.

    Returns:
        A list with one ``int`` prediction per predicted row.
    """
    ##all of data becomes the training part, so the test index list is empty
    train_set, _ = utils.cut_dataset(range(len(data)), [], data, features)
    ##and the other way round for the data that has to be classified
    _, test_set = utils.cut_dataset(
        [], range(len(data_to_classify)), data_to_classify, features)

    if svm_kernel != "linear":
        model = svm.SVC(C=cost, gamma=g, kernel='rbf')
    else:
        model = svm.LinearSVC(C=cost)  ##used for gisette and madelon

    model.fit(train_set.fts_values, train_set.fts_pred)
    raw_labels = model.predict(test_set.fts_values)
    ##hand back plain ints instead of whatever type predict() produced
    return [int(label) for label in raw_labels]
def classify(folds, data, features, cost, gamma, svm_kernel):
    """Return the mean ``classification_svm`` score of ``features`` over ``folds``.

    Args:
        folds: iterable of ``(train, test)`` pairs; each pair is passed to
            ``utils.cut_dataset`` to split ``data`` for that fold.
        data: dataset that is split for every fold.
        features: feature selection handed to ``utils.cut_dataset``.
        cost: forwarded to ``classification_svm``.
        gamma: forwarded to ``classification_svm``.
        svm_kernel: forwarded to ``classification_svm``.

    Returns:
        The arithmetic mean of the per-fold scores as a float. A NaN score
        counts as 0. When ``folds`` yields nothing the result is 0.0
        (previously this raised ZeroDivisionError).
    """
    scores = []
    for train, test in folds:  ##run a classification for each fold
        train_set, test_set = utils.cut_dataset(train, test, data, features)
        score = classification_svm(train_set, test_set, cost, gamma, svm_kernel)
        if math.isnan(score):
            score = 0  ##a NaN score must not poison the mean
        scores.append(score)

    ##test the collected scores rather than folds itself, so a generator works too
    if not scores:
        return 0.0
    return sum(scores) / float(len(scores))
def _print_fields(*fields):
    """Print ``fields`` separated by single spaces on one line."""
    ##one pre-joined string gives the same output from the Python 2 print
    ##statement and the Python 3 print function
    print(" ".join(str(field) for field in fields))


def worker_classification(id, work_list, data, features, n_train, n_test, data_pipe, com_pipe):
    """Test feature subsets from ``work_list`` until no more work is handed out.

    Each round takes the last subset off ``work_list``, cuts ``data`` down to
    that subset's features, scores it with ``classification_svm`` and appends
    the score to the subset's ``parents_accuracy``. When ``checkExpand``
    accepts the subset, the combinations returned by ``get_combinations`` are
    put in front of the remaining work. Once more than ``comm_rate`` seconds
    have passed, ``manager_control`` is called. When the work list is empty,
    ``ask_for_work`` is called; an empty answer ends the loop, after which a
    one-line summary is printed.

    Args:
        id: worker identifier; printed and passed to the manager helpers.
        work_list: list of subsets still to test; consumed from the end.
        data: dataset passed to ``utils.cut_dataset``.
        features: passed to ``get_combinations``.
        n_train: training selection passed to ``utils.cut_dataset``.
        n_test: test selection passed to ``utils.cut_dataset``.
        data_pipe: passed through to ``ask_for_work`` / ``manager_control``.
        com_pipe: passed through to ``ask_for_work`` / ``manager_control``.

    Returns:
        None. Progress and the final summary are printed.
    """
    worker_time = time.time()  ##to know how much time did the worker work
    best_acc = 0
    lowest_acc = 0
    wl_dict = {}  ##to keep all the generated subsets on this slave
    best_sets = {}
    last_test_sets = {}  ##subsets tested since the last talk with the manager
    total_wasted = 0
    testing = True
    number_of_tests = 0
    comm_rate = 60  ##seconds between attempts to talk to the manager
    counter_time = time.time()
    talk_counter = 0
    talk_avgs = []
    talk_time = time.time()

    while testing:
        if not work_list:
            ##the last test sets will be sent on next talk to the manager
            if number_of_tests < 30:
                _print_fields(id, " waiting cause ran out of work too soon")
                time.sleep(10)
            work_list, best_acc, wasted_time = ask_for_work(id, data_pipe, com_pipe)
            total_wasted += wasted_time
            if not work_list:  ##didnt receive work
                testing = False
                break

        ##removing from the end of a list is much cheaper than from the front
        test_subset = work_list.pop()
        cannonical_name = ','.join(str(e) for e in test_subset.features)
        train_set, test_set = utils.cut_dataset(n_train, n_test, data, test_subset.features)
        ##NOTE(review): classify() passes cost, gamma and svm_kernel to
        ##classification_svm as well; this call presumably relies on defaults - confirm
        acc = classification_svm(train_set, test_set)  ##train and test the dataset
        test_subset.parents_accuracy.append(acc)
        number_of_tests += 1
        last_test_sets[cannonical_name] = True  ##add to last sets

        if checkExpand(test_subset, best_acc):
            ##second value returned by get_combinations is not needed here
            new_combs, _ = get_combinations(work_list, test_subset, features, wl_dict)
            work_list = new_combs + work_list  ##add new combinations to the list

        if acc > lowest_acc:  ##the subset is better than the previous ones
            if acc > best_acc:  ##acc is the best one found so far
                best_acc = acc
            lowest_acc = update_best_sets(best_sets, acc, test_subset)

        if (time.time() - counter_time) > comm_rate and testing:  ##try to talk to manager
            ##NOTE(review): the 4th value returned by manager_control is never
            ##read; ask_for_work's 2nd value updates best_acc - confirm whether
            ##this one should too
            work_list, checked, wasted_time, _ = manager_control(
                id, work_list, last_test_sets, data_pipe, com_pipe, best_acc)
            ##NOTE(review): the reported time is discarded, so talks with the
            ##manager never add to total_wasted - confirm this is intended
            wasted_time = 0.0
            total_wasted += wasted_time
            ##NOTE(review): the source had lost its indentation; everything
            ##below is assumed to belong to the `checked` branch - confirm
            if checked:  ##worker communicated with manager
                talk_avgs.append(time.time() - talk_time)
                talk_counter += 1
                _print_fields("PROCESS ", id, " Number of tests: ", number_of_tests)
                ##reset the tested list so less information is passed to the manager next time
                last_test_sets = {}
                talk_time = time.time()
                counter_time = time.time()

    total_working_time = time.time() - worker_time
    ##a worker that never recorded a best set or never reached the manager
    ##must still be able to print its summary
    best_local = max(best_sets) if best_sets else None
    if talk_avgs:
        talk_avg = round(sum(talk_avgs) / float(len(talk_avgs)), 2)
    else:
        talk_avg = 0.0
    _print_fields("PROCESS", id, ",waste_t:", total_wasted,
                  ",work_t:", total_working_time, ",n_test:", number_of_tests,
                  ",best acc:", best_local, "talk_counter:", talk_counter,
                  ",talk_avg:", talk_avg)
def worker_classification(id, work_list, data, features, n_train, n_test, data_pipe, com_pipe):
    """Test feature subsets from ``work_list`` until no more work is handed out.

    Each round takes the last subset off ``work_list``, cuts ``data`` down to
    that subset's features, scores it with ``classification_svm`` and appends
    the score to the subset's ``parents_accuracy``. When ``checkExpand``
    accepts the subset, the combinations returned by ``get_combinations`` are
    put in front of the remaining work. Once more than ``comm_rate`` seconds
    have passed, ``manager_control`` is called. When the work list is empty,
    ``ask_for_work`` is called; an empty answer ends the loop, after which a
    one-line summary is printed.

    Args:
        id: worker identifier; printed and passed to the manager helpers.
        work_list: list of subsets still to test; consumed from the end.
        data: dataset passed to ``utils.cut_dataset``.
        features: passed to ``get_combinations``.
        n_train: training selection passed to ``utils.cut_dataset``.
        n_test: test selection passed to ``utils.cut_dataset``.
        data_pipe: passed through to ``ask_for_work`` / ``manager_control``.
        com_pipe: passed through to ``ask_for_work`` / ``manager_control``.

    Returns:
        None. Progress and the final summary are printed.
    """
    worker_time = time.time()  ##to know how much time did the worker work
    best_acc = 0
    lowest_acc = 0
    wl_dict = {}  ##to keep all the generated subsets on this slave
    best_sets = {}
    last_test_sets = {}  ##subsets tested since the last talk with the manager
    total_wasted = 0
    testing = True
    number_of_tests = 0
    comm_rate = 60  ##seconds between attempts to talk to the manager
    counter_time = time.time()
    talk_counter = 0
    talk_avgs = []
    talk_time = time.time()

    while testing:
        if not work_list:
            ##the last test sets will be sent on next talk to the manager
            if number_of_tests < 30:
                print(id, " waiting cause ran out of work too soon")
                time.sleep(10)
            work_list, best_acc, wasted_time = ask_for_work(
                id, data_pipe, com_pipe)
            total_wasted += wasted_time
            if not work_list:  ##didnt receive work
                testing = False
                break

        ##removing from the end of a list is much cheaper than from the front
        test_subset = work_list.pop()
        cannonical_name = ','.join(str(e) for e in test_subset.features)
        train_set, test_set = utils.cut_dataset(
            n_train, n_test, data, test_subset.features)
        ##NOTE(review): classify() passes cost, gamma and svm_kernel to
        ##classification_svm as well; this call presumably relies on defaults - confirm
        acc = classification_svm(train_set, test_set)  ##train and test the dataset
        test_subset.parents_accuracy.append(acc)
        number_of_tests += 1
        last_test_sets[cannonical_name] = True  ##add to last sets

        if checkExpand(test_subset, best_acc):
            ##second value returned by get_combinations is not needed here
            new_combs, _ = get_combinations(
                work_list, test_subset, features, wl_dict)
            work_list = new_combs + work_list  ##add new combinations to the list

        if acc > lowest_acc:  ##the subset is better than the previous ones
            if acc > best_acc:  ##acc is the best one found so far
                best_acc = acc
            lowest_acc = update_best_sets(best_sets, acc, test_subset)

        if (time.time() - counter_time) > comm_rate and testing:  ##try to talk to manager
            ##NOTE(review): the 4th value returned by manager_control is never
            ##read; ask_for_work's 2nd value updates best_acc - confirm whether
            ##this one should too
            work_list, checked, wasted_time, _ = manager_control(
                id, work_list, last_test_sets, data_pipe, com_pipe, best_acc)
            ##NOTE(review): the reported time is discarded, so talks with the
            ##manager never add to total_wasted - confirm this is intended
            wasted_time = 0.0
            total_wasted += wasted_time
            ##NOTE(review): the source had lost its indentation; everything
            ##below is assumed to belong to the `checked` branch - confirm
            if checked:  ##worker communicated with manager
                talk_avgs.append(time.time() - talk_time)
                talk_counter += 1
                print("PROCESS ", id, " Number of tests: ", number_of_tests)
                ##reset the tested list so less information is passed to the manager next time
                last_test_sets = {}
                talk_time = time.time()
                counter_time = time.time()

    total_working_time = time.time() - worker_time
    ##a worker that never recorded a best set or never reached the manager
    ##must still be able to print its summary
    best_local = max(best_sets) if best_sets else None
    if talk_avgs:
        talk_avg = round(sum(talk_avgs) / float(len(talk_avgs)), 2)
    else:
        talk_avg = 0.0
    print("PROCESS", id, ",waste_t:", total_wasted, ",work_t:",
          total_working_time, ",n_test:", number_of_tests, ",best acc:",
          best_local, "talk_counter:", talk_counter, ",talk_avg:", talk_avg)