def wrapper(n_proc, data, features):
    """Run the classification stage in parallel.

    Splits the feature list among n_proc-1 worker processes, splits the
    data once into train/test sets, spawns the workers and hands control
    to manage_workers() to coordinate them.

    Fix: converted Python 2 ``print`` statements to ``print()`` calls so
    this block agrees with the Python 3 syntax used elsewhere in the file.
    """
    # Division of features for each process: this creates the starting
    # work list for every worker.
    print("Using ", len(features), " features")
    p_work_list = utils.divide_features(n_proc - 1, features)
    # Divide data into train and test set (deterministic split).
    n_train, n_test = utils.divide_data_not_random(data)
    print("Using ", len(n_train), " instances for train and ", len(n_test), " for test")
    # Spawn processes.
    data_pipe = mp.Pipe()   # shared communication pipe to receive work lists
    pipes = []              # each worker has its own pipe
    workers = []            # keeps track of the worker processes
    current_work_list = {}
    # Start workers.
    for i in range(n_proc - 1):
        current_work_list[i] = {}
        pipe = mp.Pipe()
        p = mp.Process(target=worker_classification,
                       args=(i, p_work_list[i], data, features,
                             n_train, n_test, data_pipe, pipe))
        workers.append(p)
        pipes.append(pipe)
        p.start()
    # Manage workers until the run finishes.
    manage_workers(data_pipe, pipes, workers, current_work_list)
    return
def wrapper(n_proc, data, features):
    """Distribute the feature list over n_proc-1 worker processes and let
    manage_workers() coordinate them.

    The data set is split once (non-randomly) into a train and a test
    portion that every worker shares.
    """
    print("Using ", len(features), " features")
    # Build the initial per-worker feature assignment.
    per_worker_features = utils.divide_features(n_proc - 1, features)
    train_part, test_part = utils.divide_data_not_random(data)
    print("Using ", len(train_part), " instances for train and ", len(test_part), " for test")
    results_channel = mp.Pipe()   # shared channel for incoming work lists
    worker_channels = []          # one private channel per worker
    spawned = []                  # live Process handles
    current_work_list = {}
    # Launch every worker with its slice of the features.
    for worker_id in range(n_proc - 1):
        current_work_list[worker_id] = {}
        channel = mp.Pipe()
        proc = mp.Process(
            target=worker_classification,
            args=(worker_id, per_worker_features[worker_id], data, features,
                  train_part, test_part, results_channel, channel))
        spawned.append(proc)
        worker_channels.append(channel)
        proc.start()
    # Coordinate the workers until the search is over.
    manage_workers(results_channel, worker_channels, spawned, current_work_list)
    return
def wrapper(data, features, settings):
    """Wrapper feature-selection stage.

    Estimates (or reads from `settings`) the SVM parameters, computes the
    per-feature probabilities, then runs the parallel subset search with
    `settings.number_proc` workers (rank 0 runs in the current process).

    Returns (cost, gamma); per the original note, the selected subset
    itself is read from a file by the caller, not returned here.

    Fix: converted Python 2 ``print`` statements to ``print()`` calls so
    this block agrees with the Python 3 syntax used elsewhere in the file.
    """
    print("Starting Wrapper Part With", len(features), " features")
    n_proc = settings.number_proc
    # Divide features among processors.
    p_work_list = utils.divide_features(n_proc, features, True)
    # Division of data into train/test folds (cross-validation techniques).
    folds = generate_folds(data, settings.cross_validation)
    # The second SVM parameter (gamma) only needs estimating for
    # non-linear kernels.
    estimate_g = settings.svm_kernel != "linear"
    # Parameter estimation in case parameters were not provided.
    if settings.svm_parameters == []:
        print("Starting grid search with ", settings.grid_tests)
        cost, gamma = classification_part.parallel_grid_search(
            settings, folds, data, features, estimate_g)
    else:
        cost = float(settings.svm_parameters[0])
        gamma = float(settings.svm_parameters[1]) if estimate_g else 0.0
    print("Using ", cost, gamma, " as parameters for SVM")
    probs = options_wrapper.generate_probabilities(
        settings, features, data, folds, p_work_list, cost, gamma)
    print("probabilities vector:", probs)
    # Manager spawns a helper process to hold shared variables; the
    # original author notes this may not be the best option.
    manager = mp.Manager()
    # Setup for parallelization.
    share_lock = mp.Lock()
    score_lock = mp.Lock()
    work_lock = mp.Lock()
    global_info = manager.Namespace()   # shared variables
    global_info.best_score = 0.0        # best score found so far
    global_info.needing_work = []       # processes that currently need work
    global_info.chosen_to_send = -1     # process selected to send the work
    # Presumably a memo of already-evaluated feature combinations --
    # confirm against worker_classification.
    comb_memory = manager.dict()
    pipes = []       # each process needs its own pipe to receive work
    work_size = []   # amount of work each process currently holds
    for i in range(n_proc):
        pipes.append(mp.Pipe())
        # Start with some work so a process does not finish before
        # receiving anything in the beginning.
        work_size.append(50)
    global_info.work_size = work_size
    # Spawn workers for ranks 1..n_proc-1.
    workers = []
    for i in range(1, n_proc):
        p = mp.Process(target=worker_classification,
                       args=(i, p_work_list[i], comb_memory, data, features,
                             folds, settings, pipes, share_lock, global_info,
                             probs, cost, gamma, score_lock, work_lock))
        workers.append(p)
        p.start()
    # Send the main process to work as rank 0.
    worker_classification(0, p_work_list[0], comb_memory, data, features,
                          folds, settings, pipes, share_lock, global_info,
                          probs, cost, gamma, score_lock, work_lock)
    # Finish all workers.
    for w in workers:
        w.join()
    return cost, gamma
def filter_features(data, settings):
    """Filter feature-selection stage, parallelized over n_proc processes.

    Each worker scores a slice of the features via f_features() and sends
    its results back over a dedicated pipe.  Features whose MI score is
    within `settings.percentage_filter` of the top score are selected.

    Returns (selected_feature_ids, used_bins, scores_sorted_by_mi_desc).

    Fixes: Python 2 ``print`` -> ``print()``; local name `input` renamed
    so it no longer shadows the builtin; bare quit() (a site.py
    convenience with exit status 0) replaced by ``raise SystemExit(1)``
    so the error path returns a failure status.
    """
    n_fts = len(data[0].values)
    print("Dataset has ", n_fts, " features, processing filter selection in parallel")
    # One pipe per worker -- a single shared pipe is not enough for huge
    # datasets (original note).
    n_proc = settings.number_proc
    pipes = [mp.Pipe() for _ in range(n_proc)]
    shared_lock = mp.Lock()
    # False -> return actual feature numbers instead of classes.
    features_division = utils.divide_features(n_proc, range(n_fts), False)
    # Parallelize the work; rank 0 runs in the current process.
    workers = []
    for i in range(1, n_proc):
        p = mp.Process(target=f_features,
                       args=(i, features_division[i], data, shared_lock, pipes[i]))
        workers.append(p)
        p.start()
    f_features(0, features_division[0], data, shared_lock, pipes[0])
    # NOTE(review): joining before draining the pipes can deadlock if a
    # worker's payload exceeds the pipe buffer -- confirm f_features
    # sends small results, or move the recv loop above the joins.
    for w in workers:
        w.join()
    filter_scores = []
    for recv_end, send_end in pipes:
        filter_scores += recv_end.recv()
    # Either we received the score of every feature or some error happened.
    if len(filter_scores) != n_fts:
        print("Didn't receive score for every features. ERROR")
        raise SystemExit(1)
    filter_scores = sorted(filter_scores, key=lambda x: x[0])  # order by feature id
    used_bins = [f[1] for f in filter_scores]
    # The whole data set could be discretized here with used_bins
    # (discritize_data); per the original note it improved the wrapper's
    # accuracy on seen data but hurt unseen data, so it stays disabled.
    # Order scores by MI score, descending.
    sort_by_mi = sorted(filter_scores, key=lambda x: x[3], reverse=True)
    # Select features: threshold is defined relative to the top MI score.
    top_mi = sort_by_mi[0][3]
    threshold = round(top_mi - (top_mi * settings.percentage_filter), 8)
    fts = [s[0] for s in sort_by_mi if s[3] >= threshold]
    return fts, used_bins, sort_by_mi
def filter_features(data, settings):
    """Score every feature in parallel and keep the high-MI subset.

    The feature indices are divided among `settings.number_proc`
    processes; each process scores its slice with f_features() and sends
    the scores back on its own pipe.  A feature is kept when its MI score
    is at least ``top_mi * (1 - settings.percentage_filter)``.

    Returns (selected_feature_ids, used_bins, scores_sorted_by_mi_desc).

    Fixes applied: Python 2 ``print`` statements converted to ``print()``;
    the pipe unpacking no longer shadows the builtin ``input``; the error
    path raises SystemExit(1) instead of calling quit(), which is a
    site.py convenience and exits with status 0.
    """
    n_fts = len(data[0].values)
    print("Dataset has ", n_fts, " features, processing filter selection in parallel")
    n_proc = settings.number_proc
    # One pipe per process; a single shared pipe is not enough for huge
    # datasets (original note).
    pipes = [mp.Pipe() for _ in range(n_proc)]
    shared_lock = mp.Lock()
    # False means: return actual feature numbers instead of classes.
    features_division = utils.divide_features(n_proc, range(n_fts), False)
    # Fan out the scoring; this process handles slice 0 itself.
    workers = []
    for rank in range(1, n_proc):
        proc = mp.Process(target=f_features,
                          args=(rank, features_division[rank], data,
                                shared_lock, pipes[rank]))
        workers.append(proc)
        proc.start()
    f_features(0, features_division[0], data, shared_lock, pipes[0])
    # NOTE(review): joining before recv() can deadlock if a worker's
    # result overfills the pipe buffer -- verify payload sizes.
    for worker in workers:
        worker.join()
    filter_scores = []
    for recv_conn, _send_conn in pipes:
        filter_scores += recv_conn.recv()
    if len(filter_scores) != n_fts:
        # We must have one score per feature; anything else is an error.
        print("Didn't receive score for every features. ERROR")
        raise SystemExit(1)
    filter_scores = sorted(filter_scores, key=lambda x: x[0])  # by feature id
    used_bins = [entry[1] for entry in filter_scores]
    # Discretizing the full data set with used_bins is possible here but
    # disabled: the original author found it hurt results on unseen data.
    # Rank by MI score, highest first.
    sort_by_mi = sorted(filter_scores, key=lambda x: x[3], reverse=True)
    top_mi = sort_by_mi[0][3]
    threshold = round(top_mi - (top_mi * settings.percentage_filter), 8)
    fts = [entry[0] for entry in sort_by_mi if entry[3] >= threshold]
    return fts, used_bins, sort_by_mi
def wrapper(data, features, settings):
    """Parallel wrapper feature-selection stage.

    Determines the SVM parameters (grid search when none are supplied),
    builds the shared multiprocessing state, then runs the subset search
    on `settings.number_proc` processes with rank 0 in this process.

    Returns (cost, gamma); per the original note, the selected result is
    read from a file rather than returned.

    Fix: converted Python 2 ``print`` statements to ``print()`` calls so
    this block agrees with the Python 3 syntax used elsewhere in the file.
    """
    print("Starting Wrapper Part With", len(features), " features")
    n_proc = settings.number_proc
    # Divide features among processors.
    p_work_list = utils.divide_features(n_proc, features, True)
    # Cross-validation folds for the train/test division.
    folds = generate_folds(data, settings.cross_validation)
    # gamma only needs estimating when the kernel is non-linear.
    estimate_g = settings.svm_kernel != "linear"
    # Estimate parameters when none were provided.
    if settings.svm_parameters == []:
        print("Starting grid search with ", settings.grid_tests)
        cost, gamma = classification_part.parallel_grid_search(
            settings, folds, data, features, estimate_g)
    else:
        cost = float(settings.svm_parameters[0])
        gamma = float(settings.svm_parameters[1]) if estimate_g else 0.0
    print("Using ", cost, gamma, " as parameters for SVM")
    probs = options_wrapper.generate_probabilities(
        settings, features, data, folds, p_work_list, cost, gamma)
    print("probabilities vector:", probs)
    # Manager spawns a helper process to hold shared variables (original
    # author notes this may not be the best option).
    manager = mp.Manager()
    share_lock = mp.Lock()
    score_lock = mp.Lock()
    work_lock = mp.Lock()
    global_info = manager.Namespace()   # manager for shared variables
    global_info.best_score = 0.0        # best score found so far
    global_info.needing_work = []       # processes that currently need work
    global_info.chosen_to_send = -1     # process selected to send the work
    # Presumably caches evaluated combinations -- confirm against
    # worker_classification.
    comb_memory = manager.dict()
    pipes = []       # each process gets its own pipe to receive work
    work_size = []   # amount of work each process currently holds
    for i in range(n_proc):
        pipes.append(mp.Pipe())
        # Seed with some work so processes do not finish before
        # receiving anything in the beginning.
        work_size.append(50)
    global_info.work_size = work_size
    # Spawn processes for ranks 1..n_proc-1.
    workers = []
    for i in range(1, n_proc):
        p = mp.Process(target=worker_classification,
                       args=(i, p_work_list[i], comb_memory, data, features,
                             folds, settings, pipes, share_lock, global_info,
                             probs, cost, gamma, score_lock, work_lock))
        workers.append(p)
        p.start()
    # Send the main process to work as rank 0.
    worker_classification(0, p_work_list[0], comb_memory, data, features,
                          folds, settings, pipes, share_lock, global_info,
                          probs, cost, gamma, score_lock, work_lock)
    # Finish all workers.
    for w in workers:
        w.join()
    return cost, gamma