Python cut_datasetの例、utils.cut_dataset Pythonの例

コード例 #1

0

ファイルを表示

ファイル: classification_part.py プロジェクト: JSilva90/MITWS

def classify_final_test(data, features, data_to_classify, cost, g, svm_kernel):
    """Train an SVM on all of ``data`` and label ``data_to_classify``.

    Args:
        data: Labelled dataset; every row is used for training.
        features: Feature selection forwarded to ``utils.cut_dataset``.
        data_to_classify: Dataset whose rows are to be predicted.
        cost: Value passed as ``C`` to the SVM.
        g: Value passed as ``gamma`` to the RBF SVM (unused for "linear").
        svm_kernel: ``"linear"`` selects ``svm.LinearSVC``; any other value
            selects ``svm.SVC`` with an RBF kernel.

    Returns:
        A list with one ``int`` prediction per entry returned by
        ``clf.predict`` on the test set.
    """
    # Put every row of `data` on the training side; the test half is ignored.
    training_rows = range(len(data))
    train_set, _ = utils.cut_dataset(training_rows, [], data, features)

    # Conversely, every row of `data_to_classify` goes to the test side.
    rows_to_label = range(len(data_to_classify))
    _, test_set = utils.cut_dataset([], rows_to_label, data_to_classify, features)

    if svm_kernel != "linear":
        model = svm.SVC(C=cost, gamma=g, kernel='rbf')
    else:
        model = svm.LinearSVC(C=cost)  # used for gisette and madelon

    model.fit(train_set.fts_values, train_set.fts_pred)
    raw_predictions = model.predict(test_set.fts_values)
    # Coerce each predicted label to a plain int.
    return [int(label) for label in raw_predictions]

コード例 #2

0

ファイルを表示

ファイル: classification_part.py プロジェクト: world4jason/MITWS

def classify_final_test(data, features, data_to_classify, cost, g, svm_kernel):
    """Fit an SVM on the whole of ``data`` and predict ``data_to_classify``.

    Args:
        data: Labelled dataset; all of its rows become training rows.
        features: Feature selection forwarded to ``utils.cut_dataset``.
        data_to_classify: Dataset whose rows are to be predicted.
        cost: Value passed as ``C`` to the SVM.
        g: Value passed as ``gamma`` to the RBF SVM (unused for "linear").
        svm_kernel: ``"linear"`` selects ``svm.LinearSVC``; any other value
            selects ``svm.SVC`` with an RBF kernel.

    Returns:
        A list of ``int`` predictions, one per entry returned by
        ``clf.predict`` on the test set.
    """
    # All rows of `data` are training rows; the returned test half is unused.
    training_rows = range(len(data))
    train_set, _ = utils.cut_dataset(training_rows, [], data, features)

    # All rows of `data_to_classify` are test rows; the train half is unused.
    rows_to_label = range(len(data_to_classify))
    _, test_set = utils.cut_dataset([], rows_to_label, data_to_classify,
                                    features)

    if svm_kernel != "linear":
        model = svm.SVC(C=cost, gamma=g, kernel='rbf')
    else:
        model = svm.LinearSVC(C=cost)  # used for gisette and madelon

    model.fit(train_set.fts_values, train_set.fts_pred)
    raw_predictions = model.predict(test_set.fts_values)
    # Coerce each predicted label to a plain int.
    return [int(label) for label in raw_predictions]

コード例 #3

0

ファイルを表示

ファイル: classification_part.py プロジェクト: JSilva90/MITWS

def classify(folds, data, features, cost, gamma, svm_kernel):
    """Return the mean ``classification_svm`` score over all folds.

    Args:
        folds: Iterable of ``(train, test)`` pairs; each pair is forwarded to
            ``utils.cut_dataset`` to split ``data``.
        data: Dataset forwarded to ``utils.cut_dataset``.
        features: Feature selection forwarded to ``utils.cut_dataset``.
        cost: Forwarded to ``classification_svm``.
        gamma: Forwarded to ``classification_svm``.
        svm_kernel: Forwarded to ``classification_svm``.

    Returns:
        The arithmetic mean of the per-fold scores as a float. NaN scores are
        counted as 0. With no folds at all the result is 0.0 (previously this
        raised ZeroDivisionError).
    """
    scores = []
    for train, test in folds:  # one classification run per fold
        train_set, test_set = utils.cut_dataset(train, test, data, features)
        score = classification_svm(train_set, test_set, cost, gamma, svm_kernel)
        # A NaN score would turn the whole mean into NaN, so count it as 0.
        scores.append(0 if math.isnan(score) else score)

    if not scores:
        # No folds were supplied: there is nothing to average.
        return 0.0
    return sum(scores) / float(len(scores))

コード例 #4

0

ファイルを表示

ファイル: classification_part.py プロジェクト: world4jason/MITWS

def classify(folds, data, features, cost, gamma, svm_kernel):
    """Return the mean ``classification_svm`` score over all folds.

    Args:
        folds: Iterable of ``(train, test)`` pairs; each pair is forwarded to
            ``utils.cut_dataset`` to split ``data``.
        data: Dataset forwarded to ``utils.cut_dataset``.
        features: Feature selection forwarded to ``utils.cut_dataset``.
        cost: Forwarded to ``classification_svm``.
        gamma: Forwarded to ``classification_svm``.
        svm_kernel: Forwarded to ``classification_svm``.

    Returns:
        The arithmetic mean of the per-fold scores as a float. NaN scores are
        counted as 0. With no folds at all the result is 0.0 (previously this
        raised ZeroDivisionError).
    """
    scores = []
    for train, test in folds:  # one classification run per fold
        train_set, test_set = utils.cut_dataset(train, test, data, features)
        score = classification_svm(train_set, test_set, cost, gamma,
                                   svm_kernel)
        # A NaN score would turn the whole mean into NaN, so count it as 0.
        scores.append(0 if math.isnan(score) else score)

    if not scores:
        # No folds were supplied: there is nothing to average.
        return 0.0
    return sum(scores) / float(len(scores))

コード例 #5

0

ファイルを表示

ファイル: wrapper_part.py プロジェクト: JSilva90/MITWS

def worker_classification(id, work_list, data, features, n_train, n_test, data_pipe, com_pipe):
    worker_time = time.time() ##to know how much time did the worker work
    best_acc = 0 
    lowest_acc = 0
    wl_dict = {}  ##to keep all the generated subsets on this slave
    best_sets = {}
    last_test_sets = {} ##new test set since last talk com with manager
    total_wasted = 0
    testing = True
    number_of_tests = 0
    comm_rate = 60
    counter_time = time.time()
    
    talk_counter = 0
    talk_avgs = []
    talk_time = time.time()
    #print "started ", id, " with work ", translate_wl(work_list)
    while (testing):        
        if work_list ==  []:
            ##the last test sets will be sent on next talk to the manager
            if number_of_tests < 30:
                print id, " waiting cause ran out of work too soon" 
                time.sleep(10)
            work_list, best_acc, wasted_time = ask_for_work(id, data_pipe, com_pipe)
            total_wasted += wasted_time
            counter = comm_rate#random.randint(50, 150) #Since we need to send the last test sents to manager lets set this to a lower value
            if work_list == []: ##didnt receive work
                testing = False
                break
        
        test_subset = work_list[len(work_list)-1]  ##get the subset to test
        del(work_list[len(work_list)-1]) ##delete the subset from the work list ##according to sources removing from the end oof the list is much faster than removing from the end
        cannonical_name = ','.join(str(e) for e in test_subset.features) ##add the canonical name to the tested_sets
        
        train_set, test_set = utils.cut_dataset(n_train, n_test, data, test_subset.features) ##get the features for the subset
        acc = classification_svm(train_set, test_set) ##train and test the dataset
        test_subset.parents_accuracy.append(acc)
        number_of_tests += 1 
        last_test_sets[cannonical_name] = True ##add to last sets
        
        #debug_data[cannonical_name] = (test_subset.features, test_subset.parents_accuracy)
        if checkExpand(test_subset, best_acc): 
            new_combs, r_counter = get_combinations(work_list, test_subset, features, wl_dict) ##check for new combinantions
            work_list = new_combs + work_list  ##add new combinations to the list

        if acc > lowest_acc:  ##check if the subset is better then the previous ones
            if acc > best_acc: ## if acc is the best one found so far
                best_acc = acc
            lowest_acc = update_best_sets(best_sets, acc, test_subset)
        #counter -= 1
        
        #rts.append(time.time() - round_st)
        if (time.time() - counter_time) > comm_rate and testing: ## try to talk to manager
            checked = False;
            work_list, checked, wasted_time, acc = manager_control(id, work_list, last_test_sets, data_pipe, com_pipe, best_acc)
            wasted_time = 0.0
            total_wasted += wasted_time
            #print "PROCESS ", id , " Number of tests: ", number_of_tests, " in ", round(time.time() - worker_time, 2)
                
            if checked: ##worker communicated with manager
                talk_avgs.append(time.time() - talk_time)
                talk_counter += 1
                #print "PROCESS", id, " WORK SIZE ", len(work_list), "round time: ", rt ," expand count ", expand_count, " best acc:", max(best_sets), " global acc: ", best_acc
                print "PROCESS ", id , " Number of tests: ", number_of_tests
                last_test_sets = {} ##reset the tested list so less information is passed to the manager next time
                talk_time = time.time()
                counter_time = time.time()
     
            
    total_working_time = time.time() - worker_time
    print "PROCESS", id, ",waste_t:", total_wasted, ",work_t:", total_working_time, ",n_test:", number_of_tests, ",best acc:", max(best_sets), "talk_counter:", talk_counter, ",talk_avg:", round(sum(talk_avgs) / float(len(talk_avgs)),2)

コード例 #6

0

ファイルを表示

def worker_classification(id, work_list, data, features, n_train, n_test,
                          data_pipe, com_pipe):
    """Worker loop: score feature subsets from ``work_list`` until no work is left.

    Each iteration takes the last subset off ``work_list``, cuts the dataset
    down to that subset's features with ``utils.cut_dataset``, scores it with
    ``classification_svm`` and appends the score to the subset's
    ``parents_accuracy``. When ``checkExpand`` approves, new combinations from
    ``get_combinations`` are put in front of the remaining work. Once more
    than ``comm_rate`` seconds have passed since the start (or since the last
    confirmed exchange), ``manager_control`` is called. When the list runs
    empty the worker calls ``ask_for_work``; an empty reply ends the loop.
    A one-line summary is printed before returning.

    Args:
        id: Worker identifier; printed in log lines and passed to
            ``ask_for_work`` / ``manager_control``.
        work_list: List of subsets to test; consumed from the end.
        data: Dataset forwarded to ``utils.cut_dataset``.
        features: Forwarded to ``get_combinations``.
        n_train: Forwarded to ``utils.cut_dataset`` as the train selection.
        n_test: Forwarded to ``utils.cut_dataset`` as the test selection.
        data_pipe: Forwarded to ``ask_for_work`` / ``manager_control``.
        com_pipe: Forwarded to ``ask_for_work`` / ``manager_control``.

    Returns:
        None. Results are reported through the manager calls and stdout.
    """
    worker_time = time.time()  # start time, for the total-working-time report
    best_acc = 0
    lowest_acc = 0
    wl_dict = {}  # handed to get_combinations; keeps the subsets generated on this worker
    best_sets = {}
    last_test_sets = {}  # subsets tested since the last confirmed talk with the manager
    total_wasted = 0
    number_of_tests = 0
    comm_rate = 60  # seconds between attempts to talk to the manager
    counter_time = time.time()

    talk_counter = 0
    talk_avgs = []
    talk_time = time.time()
    while True:
        if not work_list:
            # The last test sets will be sent on the next talk to the manager.
            if number_of_tests < 30:
                print(id, " waiting cause ran out of work too soon")
                time.sleep(10)
            work_list, best_acc, wasted_time = ask_for_work(
                id, data_pipe, com_pipe)
            total_wasted += wasted_time
            if not work_list:  # didn't receive work: stop
                break

        # Take the subset from the end of the list: removing from the end is
        # much cheaper than removing from the front.
        test_subset = work_list.pop()
        cannonical_name = ','.join(str(e) for e in test_subset.features)

        # Restrict the dataset to this subset's features, then train and test.
        train_set, test_set = utils.cut_dataset(
            n_train, n_test, data, test_subset.features)
        acc = classification_svm(train_set, test_set)
        test_subset.parents_accuracy.append(acc)
        number_of_tests += 1
        last_test_sets[cannonical_name] = True

        if checkExpand(test_subset, best_acc):
            # New combinations go in front, so they are tested after the
            # work that is already queued at the end.
            new_combs, _ = get_combinations(
                work_list, test_subset, features, wl_dict)
            work_list = new_combs + work_list

        if acc > lowest_acc:  # better than the weakest kept result
            if acc > best_acc:  # best one found so far
                best_acc = acc
            lowest_acc = update_best_sets(best_sets, acc, test_subset)

        if (time.time() - counter_time) > comm_rate:  # try to talk to the manager
            work_list, checked, wasted_time, _ = manager_control(
                id, work_list, last_test_sets, data_pipe, com_pipe, best_acc)
            # NOTE(review): the wasted time reported by manager_control is
            # discarded here, so only ask_for_work contributes to
            # total_wasted. Kept as-is; confirm whether this is intended.
            wasted_time = 0.0
            total_wasted += wasted_time

            if checked:  # worker communicated with the manager
                talk_avgs.append(time.time() - talk_time)
                talk_counter += 1
                print("PROCESS ", id, " Number of tests: ", number_of_tests)
                last_test_sets = {}  # reset so less information is sent next time
                talk_time = time.time()
                counter_time = time.time()

    total_working_time = time.time() - worker_time
    # Guard the summary: a worker that kept no subset or never had a confirmed
    # talk with the manager has nothing to take max() or a mean of; the
    # unguarded expressions raised ValueError / ZeroDivisionError here.
    # max(best_sets) is the largest key of best_sets (presumably an accuracy;
    # confirm against update_best_sets).
    best_found = max(best_sets) if best_sets else None
    if talk_avgs:
        talk_avg = round(sum(talk_avgs) / float(len(talk_avgs)), 2)
    else:
        talk_avg = 0.0
    print("PROCESS", id, ",waste_t:", total_wasted, ",work_t:",
          total_working_time, ",n_test:", number_of_tests, ",best acc:",
          best_found, "talk_counter:", talk_counter, ",talk_avg:",
          talk_avg)