import time

import prepare_data
import filter_part
import wrapper_part


def main():
    start_t = time.time()
    print "Using ", n_proc, " processes"  ##n_proc is assumed to be defined at module level

    ##read data from file
    #data = prepare_data.import_data()
    #print "Took ", time.time() - start_t, " to read from obj file ", len(data)
    #start_t = time.time()
    data = prepare_data.import_data_csv("dataset.csv", "metadata.csv")
    #print "Took ", time.time() - start_t, " to read from csv files ", len(data)
    #quit()

    ##detect and remove outliers from the data
    data = prepare_data.remove_and_correct_outliers(data)

    ##normalize data
    data = prepare_data.normalize_data_range(data)
    ##at this point all the data is prepared

    ##filter the irrelevant features
    features = filter_part.filter_features(data)

    ##just to test with a smaller number of features
    fts = []
    for i in range(0, 20):
        fts.append(features[i])

    ##call the wrapper part
    wrapper_part.wrapper(n_proc, data, fts)  #features

    print "program took: ", time.time() - start_t


if __name__ == '__main__':
    main()
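## Hedged sketch (an assumption, not the actual prepare_data code): normalize_data_range is
## taken here to mean per-feature min-max scaling to the [0, 1] range. The helper name and the
## plain list-of-rows layout are hypothetical, for illustration only.
def minmax_normalize_sketch(rows):
    ##rows: one list of numeric feature values per sample (labels excluded)
    n_features = len(rows[0])
    mins = [min(row[i] for row in rows) for i in range(n_features)]
    maxs = [max(row[i] for row in rows) for i in range(n_features)]
    scaled = []
    for row in rows:
        scaled.append([float(row[i] - mins[i]) / (maxs[i] - mins[i]) if maxs[i] > mins[i] else 0.0
                       for i in range(n_features)])
    return scaled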
def main():
    start_t = time.time()

    ##read settings from the xml file
    settings = read_settings()
    #quit()
    print "Using ", settings.number_proc, " processes"

    ##read data according to the xml file settings
    if settings.dataset_type == "csv":
        ##values separated by commas, the last column being the label
        data = prepare_data.import_data_csv(settings.file_train[0], "")
    elif settings.dataset_type == "dense":
        ##dense type from NIPS
        data = prepare_data.import_nips_dense(settings.file_train[0], settings.file_train[1])
    elif settings.dataset_type == "sparse_binary":
        ##sparse_binary type from NIPS
        data = prepare_data.import_nips_sparse_binary(settings.file_train[0], settings.file_train[1], settings.number_features)
    elif settings.dataset_type == "sparse_integer":
        ##sparse_integer type from NIPS
        data = prepare_data.import_nips_sparse_integer(settings.file_train[0], settings.file_train[1], settings.number_features)
    else:
        print "Not a valid option for dataset type. Currently accepted values: csv, dense, sparse_binary, sparse_integer"
        quit()

    print "Read data with size ", len(data), " and ", len(data[0].values), " features."
    #create_smaller_dataset(data)

    ##normalize data; the normalization params are returned so they can be applied to future data
    data, params_norm = prepare_data.normalize_data_range(data)

    ##filter the irrelevant features; the used bins are kept to discretize future data
    features, used_bins, mi_scores = filter_part.filter_features(data, settings)
    print "selected features:\n", features

    ##call the wrapper part; it returns the cost and gamma that were used
    cost, gamma = wrapper_part.wrapper(data, features, settings)
    ##wrapper part is over
    print "program took: ", time.time() - start_t

    ##each process saves its top 5 subsets to a file
    f_res = open("res.csv", "r")
    lines = f_res.readlines()
    f_res.close()

    total_nodes = 0
    removed_nodes_by_cut = 0
    wasted_time = 0.0
    send_time = 0.0
    times_request_work = 0
    times_work_not_sent = 0
    results = []
    for res in lines:
        res = res.split(",")
        if "PROC" in res[0]:
            ##info lines: accumulate the per-process statistics
            total_nodes += int(res[2])
            removed_nodes_by_cut += int(res[4])
            wasted_time += float(res[5])
            send_time += float(res[6])
            ##note: unlike the counters above, these two are overwritten rather than accumulated
            times_request_work = int(res[7])
            times_work_not_sent = int(res[8])
            continue
        ##every other line is a feature subset followed by its score
        score = float(res[len(res) - 1])
        solution = res[:len(res) - 1]
        aux_solution = []
        for s in solution:  ##convert the feature indices to ints
            aux_solution.append(int(s))
        results.append((aux_solution, score))
        #if score > best_score:
        #    best_score = score
        #    best_set = res[:1]

    results.sort(key=lambda tup: tup[1])  ##order by score
    results.reverse()  ##descending

    ##save the best subsets into a file
    outfile = open("bestsets.txt", "a")
    for res in results:
        outfile.write(str(res[0]) + "," + str(res[1]) + "\n")
    outfile.close()

    ##got the results, now run the validation part
    print "Tested a total of: ", total_nodes, "nodes removed by the cut mechanism:", removed_nodes_by_cut
    print "Wasted time receiving:", wasted_time / float(settings.number_proc), " sending:", send_time / float(settings.number_proc), " requested work:", times_request_work / float(settings.number_proc), " times work not sent: ", times_work_not_sent
    print "Using c and g as parameters: ", cost, gamma
    print "best set ", results[0], " fts:", len(results[0][0])
    #quit()

    ##The validation consists in selecting the subset that generalizes best to unseen data.
    ##This only works for the NIPS challenge and needs to be adapted for different datasets.
    ##TODO: change this to test all the results on the validation set and use the best one.
    nips_validation(data, results, mi_scores, params_norm, used_bins, cost, gamma, settings)


if __name__ == '__main__':
    main()
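## Assumed res.csv layout, inferred only from the parsing loop above (illustrative, not a spec):
##   PROC,<?>,<nodes_tested>,<?>,<nodes_removed_by_cut>,<recv_wait>,<send_wait>,<work_requests>,<work_not_sent>
##   <feature_idx>,...,<feature_idx>,<score>
## Field names are guesses from how each value is used; fields marked <?> are not read.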
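## Hedged sketch of the dataset-agnostic validation step the TODO above asks for: retrain on the
## training split for every saved subset and keep the subset that scores best on a held-out
## validation split. scikit-learn's SVC is used purely for illustration (the project itself goes
## through nips_validation); the function name and the plain list-of-rows arguments are hypothetical.
def pick_best_subset_sketch(results, X_train, y_train, X_val, y_val, cost, gamma):
    from sklearn.svm import SVC  ##imported lazily so the sketch adds no hard dependency elsewhere

    ##results: list of (feature_index_list, wrapper_score) tuples
    best_subset, best_acc = None, -1.0
    for subset, _wrapper_score in results:
        clf = SVC(C=cost, gamma=gamma, kernel='rbf')
        clf.fit([[row[i] for i in subset] for row in X_train], y_train)
        acc = clf.score([[row[i] for i in subset] for row in X_val], y_val)
        if acc > best_acc:
            best_subset, best_acc = subset, acc
    return best_subset, best_acc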