Example 1
def main():
    if __name__ == '__main__':
        start_t = time.time()

        print "Using ", n_proc, " processes"
        ##read data from file
        #data = prepare_data.import_data()
        ##detects and removes outliers on data

        #print "Took ", time.time() - start_t, " to read from obj file ", len(data)

        #start_t = time.time()
        data = prepare_data.import_data_csv("dataset.csv", "metadata.csv")

        #print "Took ", time.time() - start_t, " to read from csv files ", len(data)

        #quit()
        data = prepare_data.remove_and_correct_outliers(data)
        ##normalize data
        data = prepare_data.normalize_data_range(data)

        ##at this point the data is fully prepared

        ##filter the irrelevant features
        features = filter_part.filter_features(data)

        #just to test a smaller number of features
        fts = []
        for i in range(0, 20):
            fts.append(features[i])

        ##call the wrapper part
        wrapper_part.wrapper(n_proc, data, fts)  ##pass 'features' instead of 'fts' to use the full feature list

        print "program took: ", time.time() - start_t
Example 2
def main():
    if __name__ == '__main__':
        start_t = time.time()

        ##read settings from the xml file
        settings = read_settings()
        #quit()

        print "Using ", settings.number_proc, " processes"

        ##read data according to xml file settings
        if settings.dataset_type == "csv":  ##dados separados por , sendo a ultima coluna a label
            data = prepare_data.import_data_csv(settings.file_train[0], "")
        elif settings.dataset_type == "dense":  ##dense type from NIPS
            data = prepare_data.import_nips_dense(settings.file_train[0],
                                                  settings.file_train[1])
        elif settings.dataset_type == "sparse_binary":  ##sparse_binary type from NIPS
            data = prepare_data.import_nips_sparse_binary(
                settings.file_train[0], settings.file_train[1],
                settings.number_features)
        elif settings.dataset_type == "sparse_integer":  ##sparse_integer type from NIPS
            data = prepare_data.import_nips_sparse_integer(
                settings.file_train[0], settings.file_train[1],
                settings.number_features)
        else:
            print "Not a valid option for dataset type. Current accepted values: csv, dense, sparse_binary, sparse_integer"
            quit()

        print "Read data with size ", len(data), " and ", len(
            data[0].values), " features."

        #create_smaller_dataset(data)

        ##normalize data
        params_norm = []
        data, params_norm = prepare_data.normalize_data_range(
            data)  ##returns the params used for normalization, to apply to future data

        ##filter the irrelevant features
        features, used_bins, mi_scores = filter_part.filter_features(
            data, settings)  ##save the bins used, so they can be applied to future data

        print "selected _features:\n", features

        ##call the wrapper part
        cost, gamma = wrapper_part.wrapper(
            data, features, settings)  ##returns the used cost and gamma
        ##wrapper part is over
        print "program took: ", time.time() - start_t

        ##each process saves the top 5 subsets to a file
        f_res = open("res.csv", "r")
        lines = f_res.readlines()
        f_res.close()

        total_nodes = 0
        removed_nodes_by_cut = 0
        wasted_time = 0.0
        send_time = 0.0
        times_request_work = 0
        results = []
        times_work_not_sent = 0

        for res in lines:
            res = res.split(",")
            if "PROC" in res[0]:  ##ignore info lines
                total_nodes += int(res[2])
                removed_nodes_by_cut += int(res[4])
                wasted_time += float(res[5])
                send_time += float(res[6])
                times_request_work += int(res[7])
                times_work_not_sent += int(res[8])
                continue
            score = float(res[-1])
            solution = res[:-1]
            aux_solution = [int(s) for s in solution]  ##convert to ints
            results.append((aux_solution, score))
            #if score > best_score:
            #   best_score = score
            #  best_set = res=[:1]
        results.sort(key=lambda tup: tup[1])  ##order by score
        results.reverse()  ##Descend

        ##save the best subsets into a file
        outfile = open("bestsets.txt", "a")
        for res in results:
            outfile.write(str(res[0]) + "," + str(res[1]) + "\n")
        outfile.close()

        ##got the results; now run the validation part
        print "Tested a total of: ", total_nodes, "nodes removed by cut mec:", removed_nodes_by_cut
        print "Wasted time receiving:", wasted_time / float(
            settings.number_proc), " sending:", send_time / float(
                settings.number_proc
            ), " requested work:", times_request_work / float(
                settings.number_proc
            ), " times work not sent: ", times_work_not_sent
        print "Using c and g as parameters: ", cost, gamma
        print "best set ", results[0], " fts:", len(results[0][0])

        #quit()

        ## The validation consists in selecting the subset that generalizes best to unseen data. This only works for the NIPS challenge; it needs to be adapted for other datasets.
        ##change this to evaluate all the results on the validation set and use the best one##
        nips_validation(data, results, mi_scores, params_norm, used_bins, cost,
                        gamma, settings)
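The res.csv parsing above reads the per-process "PROC" summary lines by position: total nodes in column 2, nodes removed by the cut mechanism in column 4, then wasted receive time, send time, work requests and refused requests. A hedged sketch of the same aggregation with the csv module, assuming exactly that column layout:

import csv

def aggregate_proc_stats(path="res.csv"):
    totals = {"nodes": 0, "cut": 0, "wasted": 0.0, "sent": 0.0,
              "requests": 0, "not_sent": 0}
    with open(path) as fh:
        for row in csv.reader(fh):
            if row and "PROC" in row[0]:  # per-process summary line
                totals["nodes"] += int(row[2])
                totals["cut"] += int(row[4])
                totals["wasted"] += float(row[5])
                totals["sent"] += float(row[6])
                totals["requests"] += int(row[7])
                totals["not_sent"] += int(row[8])
    return totals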
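The ranking step sorts the (subset, score) tuples and then reverses the list; the same descending order can be produced in a single sort call, and the single best subset can be taken directly with max. A small illustration over made-up tuples of the same shape:

results = [([3, 7, 12], 0.81), ([1, 4], 0.88), ([2, 9, 11], 0.76)]

# descending by score in one step
results.sort(key=lambda tup: tup[1], reverse=True)

# or pick only the single best subset
best_subset, best_score = max(results, key=lambda tup: tup[1])
print("%s -> %.2f" % (best_subset, best_score))  # prints: [1, 4] -> 0.88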