def wrapper(n_proc, data, features):
    """Run the classification stage in parallel.

    Splits the feature list among n_proc-1 worker processes, splits the
    data once into train/test sets, spawns the workers and hands control
    to manage_workers() to coordinate them.

    Fix: converted Python 2 ``print`` statements to ``print()`` calls so
    this block agrees with the Python 3 syntax used elsewhere in the file.
    """
    # Division of features for each process: this creates the starting
    # work list for every worker.
    print("Using ", len(features), " features")
    p_work_list = utils.divide_features(n_proc - 1, features)
    # Divide data into train and test set (deterministic split).
    n_train, n_test = utils.divide_data_not_random(data)
    print("Using ", len(n_train), " instances for train and ", len(n_test), " for test")
    # Spawn processes.
    data_pipe = mp.Pipe()   # shared communication pipe to receive work lists
    pipes = []              # each worker has its own pipe
    workers = []            # keeps track of the worker processes
    current_work_list = {}
    # Start workers.
    for i in range(n_proc - 1):
        current_work_list[i] = {}
        pipe = mp.Pipe()
        p = mp.Process(target=worker_classification,
                       args=(i, p_work_list[i], data, features,
                             n_train, n_test, data_pipe, pipe))
        workers.append(p)
        pipes.append(pipe)
        p.start()
    # Manage workers until the run finishes.
    manage_workers(data_pipe, pipes, workers, current_work_list)
    return
def wrapper(n_proc, data, features):
    """Distribute the feature list over n_proc-1 worker processes and let
    manage_workers() coordinate them.

    The data set is split once (non-randomly) into a train and a test
    portion that every worker shares.
    """
    print("Using ", len(features), " features")
    # Build the initial per-worker feature assignment.
    per_worker_features = utils.divide_features(n_proc - 1, features)
    train_part, test_part = utils.divide_data_not_random(data)
    print("Using ", len(train_part), " instances for train and ", len(test_part), " for test")
    results_channel = mp.Pipe()   # shared channel for incoming work lists
    worker_channels = []          # one private channel per worker
    spawned = []                  # live Process handles
    current_work_list = {}
    # Launch every worker with its slice of the features.
    for worker_id in range(n_proc - 1):
        current_work_list[worker_id] = {}
        channel = mp.Pipe()
        proc = mp.Process(
            target=worker_classification,
            args=(worker_id, per_worker_features[worker_id], data, features,
                  train_part, test_part, results_channel, channel))
        spawned.append(proc)
        worker_channels.append(channel)
        proc.start()
    # Coordinate the workers until the search is over.
    manage_workers(results_channel, worker_channels, spawned, current_work_list)
    return
def wrapper(data, features, settings):
    """Wrapper feature-selection stage.

    Estimates (or reads from `settings`) the SVM parameters, computes the
    per-feature probabilities, then runs the parallel subset search with
    `settings.number_proc` workers (rank 0 runs in the current process).

    Returns (cost, gamma); per the original note, the selected subset
    itself is read from a file by the caller, not returned here.

    Fix: converted Python 2 ``print`` statements to ``print()`` calls so
    this block agrees with the Python 3 syntax used elsewhere in the file.
    """
    print("Starting Wrapper Part With", len(features), " features")
    n_proc = settings.number_proc
    # Divide features among processors.
    p_work_list = utils.divide_features(n_proc, features, True)
    # Division of data into train/test folds (cross-validation techniques).
    folds = generate_folds(data, settings.cross_validation)
    # The second SVM parameter (gamma) only needs estimating for
    # non-linear kernels.
    estimate_g = settings.svm_kernel != "linear"
    # Parameter estimation in case parameters were not provided.
    if settings.svm_parameters == []:
        print("Starting grid search with ", settings.grid_tests)
        cost, gamma = classification_part.parallel_grid_search(
            settings, folds, data, features, estimate_g)
    else:
        cost = float(settings.svm_parameters[0])
        gamma = float(settings.svm_parameters[1]) if estimate_g else 0.0
    print("Using ", cost, gamma, " as parameters for SVM")
    probs = options_wrapper.generate_probabilities(
        settings, features, data, folds, p_work_list, cost, gamma)
    print("probabilities vector:", probs)
    # Manager spawns a helper process to hold shared variables; the
    # original author notes this may not be the best option.
    manager = mp.Manager()
    # Setup for parallelization.
    share_lock = mp.Lock()
    score_lock = mp.Lock()
    work_lock = mp.Lock()
    global_info = manager.Namespace()   # shared variables
    global_info.best_score = 0.0        # best score found so far
    global_info.needing_work = []       # processes that currently need work
    global_info.chosen_to_send = -1     # process selected to send the work
    # Presumably a memo of already-evaluated feature combinations --
    # confirm against worker_classification.
    comb_memory = manager.dict()
    pipes = []       # each process needs its own pipe to receive work
    work_size = []   # amount of work each process currently holds
    for i in range(n_proc):
        pipes.append(mp.Pipe())
        # Start with some work so a process does not finish before
        # receiving anything in the beginning.
        work_size.append(50)
    global_info.work_size = work_size
    # Spawn workers for ranks 1..n_proc-1.
    workers = []
    for i in range(1, n_proc):
        p = mp.Process(target=worker_classification,
                       args=(i, p_work_list[i], comb_memory, data, features,
                             folds, settings, pipes, share_lock, global_info,
                             probs, cost, gamma, score_lock, work_lock))
        workers.append(p)
        p.start()
    # Send the main process to work as rank 0.
    worker_classification(0, p_work_list[0], comb_memory, data, features,
                          folds, settings, pipes, share_lock, global_info,
                          probs, cost, gamma, score_lock, work_lock)
    # Finish all workers.
    for w in workers:
        w.join()
    return cost, gamma
def filter_features(data, settings):
    """Filter feature-selection stage, parallelized over n_proc processes.

    Each worker scores a slice of the features via f_features() and sends
    its results back over a dedicated pipe.  Features whose MI score is
    within `settings.percentage_filter` of the top score are selected.

    Returns (selected_feature_ids, used_bins, scores_sorted_by_mi_desc).

    Fixes: Python 2 ``print`` -> ``print()``; local name `input` renamed
    so it no longer shadows the builtin; bare quit() (a site.py
    convenience with exit status 0) replaced by ``raise SystemExit(1)``
    so the error path returns a failure status.
    """
    n_fts = len(data[0].values)
    print("Dataset has ", n_fts, " features, processing filter selection in parallel")
    # One pipe per worker -- a single shared pipe is not enough for huge
    # datasets (original note).
    n_proc = settings.number_proc
    pipes = [mp.Pipe() for _ in range(n_proc)]
    shared_lock = mp.Lock()
    # False -> return actual feature numbers instead of classes.
    features_division = utils.divide_features(n_proc, range(n_fts), False)
    # Parallelize the work; rank 0 runs in the current process.
    workers = []
    for i in range(1, n_proc):
        p = mp.Process(target=f_features,
                       args=(i, features_division[i], data, shared_lock, pipes[i]))
        workers.append(p)
        p.start()
    f_features(0, features_division[0], data, shared_lock, pipes[0])
    # NOTE(review): joining before draining the pipes can deadlock if a
    # worker's payload exceeds the pipe buffer -- confirm f_features
    # sends small results, or move the recv loop above the joins.
    for w in workers:
        w.join()
    filter_scores = []
    for recv_end, send_end in pipes:
        filter_scores += recv_end.recv()
    # Either we received the score of every feature or some error happened.
    if len(filter_scores) != n_fts:
        print("Didn't receive score for every features. ERROR")
        raise SystemExit(1)
    filter_scores = sorted(filter_scores, key=lambda x: x[0])  # order by feature id
    used_bins = [f[1] for f in filter_scores]
    # The whole data set could be discretized here with used_bins
    # (discritize_data); per the original note it improved the wrapper's
    # accuracy on seen data but hurt unseen data, so it stays disabled.
    # Order scores by MI score, descending.
    sort_by_mi = sorted(filter_scores, key=lambda x: x[3], reverse=True)
    # Select features: threshold is defined relative to the top MI score.
    top_mi = sort_by_mi[0][3]
    threshold = round(top_mi - (top_mi * settings.percentage_filter), 8)
    fts = [s[0] for s in sort_by_mi if s[3] >= threshold]
    return fts, used_bins, sort_by_mi
def filter_features(data, settings):
    """Score every feature in parallel and keep the high-MI subset.

    The feature indices are divided among `settings.number_proc`
    processes; each process scores its slice with f_features() and sends
    the scores back on its own pipe.  A feature is kept when its MI score
    is at least ``top_mi * (1 - settings.percentage_filter)``.

    Returns (selected_feature_ids, used_bins, scores_sorted_by_mi_desc).

    Fixes applied: Python 2 ``print`` statements converted to ``print()``;
    the pipe unpacking no longer shadows the builtin ``input``; the error
    path raises SystemExit(1) instead of calling quit(), which is a
    site.py convenience and exits with status 0.
    """
    n_fts = len(data[0].values)
    print("Dataset has ", n_fts, " features, processing filter selection in parallel")
    n_proc = settings.number_proc
    # One pipe per process; a single shared pipe is not enough for huge
    # datasets (original note).
    pipes = [mp.Pipe() for _ in range(n_proc)]
    shared_lock = mp.Lock()
    # False means: return actual feature numbers instead of classes.
    features_division = utils.divide_features(n_proc, range(n_fts), False)
    # Fan out the scoring; this process handles slice 0 itself.
    workers = []
    for rank in range(1, n_proc):
        proc = mp.Process(target=f_features,
                          args=(rank, features_division[rank], data,
                                shared_lock, pipes[rank]))
        workers.append(proc)
        proc.start()
    f_features(0, features_division[0], data, shared_lock, pipes[0])
    # NOTE(review): joining before recv() can deadlock if a worker's
    # result overfills the pipe buffer -- verify payload sizes.
    for worker in workers:
        worker.join()
    filter_scores = []
    for recv_conn, _send_conn in pipes:
        filter_scores += recv_conn.recv()
    if len(filter_scores) != n_fts:
        # We must have one score per feature; anything else is an error.
        print("Didn't receive score for every features. ERROR")
        raise SystemExit(1)
    filter_scores = sorted(filter_scores, key=lambda x: x[0])  # by feature id
    used_bins = [entry[1] for entry in filter_scores]
    # Discretizing the full data set with used_bins is possible here but
    # disabled: the original author found it hurt results on unseen data.
    # Rank by MI score, highest first.
    sort_by_mi = sorted(filter_scores, key=lambda x: x[3], reverse=True)
    top_mi = sort_by_mi[0][3]
    threshold = round(top_mi - (top_mi * settings.percentage_filter), 8)
    fts = [entry[0] for entry in sort_by_mi if entry[3] >= threshold]
    return fts, used_bins, sort_by_mi
def wrapper(data, features, settings):
    """Parallel wrapper feature-selection stage.

    Determines the SVM parameters (grid search when none are supplied),
    builds the shared multiprocessing state, then runs the subset search
    on `settings.number_proc` processes with rank 0 in this process.

    Returns (cost, gamma); per the original note, the selected result is
    read from a file rather than returned.

    Fix: converted Python 2 ``print`` statements to ``print()`` calls so
    this block agrees with the Python 3 syntax used elsewhere in the file.
    """
    print("Starting Wrapper Part With", len(features), " features")
    n_proc = settings.number_proc
    # Divide features among processors.
    p_work_list = utils.divide_features(n_proc, features, True)
    # Cross-validation folds for the train/test division.
    folds = generate_folds(data, settings.cross_validation)
    # gamma only needs estimating when the kernel is non-linear.
    estimate_g = settings.svm_kernel != "linear"
    # Estimate parameters when none were provided.
    if settings.svm_parameters == []:
        print("Starting grid search with ", settings.grid_tests)
        cost, gamma = classification_part.parallel_grid_search(
            settings, folds, data, features, estimate_g)
    else:
        cost = float(settings.svm_parameters[0])
        gamma = float(settings.svm_parameters[1]) if estimate_g else 0.0
    print("Using ", cost, gamma, " as parameters for SVM")
    probs = options_wrapper.generate_probabilities(
        settings, features, data, folds, p_work_list, cost, gamma)
    print("probabilities vector:", probs)
    # Manager spawns a helper process to hold shared variables (original
    # author notes this may not be the best option).
    manager = mp.Manager()
    share_lock = mp.Lock()
    score_lock = mp.Lock()
    work_lock = mp.Lock()
    global_info = manager.Namespace()   # manager for shared variables
    global_info.best_score = 0.0        # best score found so far
    global_info.needing_work = []       # processes that currently need work
    global_info.chosen_to_send = -1     # process selected to send the work
    # Presumably caches evaluated combinations -- confirm against
    # worker_classification.
    comb_memory = manager.dict()
    pipes = []       # each process gets its own pipe to receive work
    work_size = []   # amount of work each process currently holds
    for i in range(n_proc):
        pipes.append(mp.Pipe())
        # Seed with some work so processes do not finish before
        # receiving anything in the beginning.
        work_size.append(50)
    global_info.work_size = work_size
    # Spawn processes for ranks 1..n_proc-1.
    workers = []
    for i in range(1, n_proc):
        p = mp.Process(target=worker_classification,
                       args=(i, p_work_list[i], comb_memory, data, features,
                             folds, settings, pipes, share_lock, global_info,
                             probs, cost, gamma, score_lock, work_lock))
        workers.append(p)
        p.start()
    # Send the main process to work as rank 0.
    worker_classification(0, p_work_list[0], comb_memory, data, features,
                          folds, settings, pipes, share_lock, global_info,
                          probs, cost, gamma, score_lock, work_lock)
    # Finish all workers.
    for w in workers:
        w.join()
    return cost, gamma