Example #1
def process_house_votes_data(file_path):
    #print('LOG: process_house_votes_data() START')

    original_data = FileManager.get_csv_file_data_array(file_path)

    #Move the class column to the end
    original_col_with_class = 0
    columns_moved_data = DataManipulator.move_column_to_end(
        original_data, original_col_with_class)

    #Group input values into bins
    data_in_bins = _bin_input_attributes(columns_moved_data)

    #Turn input features into binary values (0,1)
    num_bins = 3
    col_idx = 0
    final_data = DataManipulator.expand_attributes_to_binary_values(
        data_in_bins, col_idx, num_bins)
    for col in data_in_bins[0]:
        col_idx += num_bins  #each attribute expands into num_bins columns per iteration
        if col_idx == (len(final_data[0]) - 1):  #skip the last column: it holds the class value
            break
        final_data = DataManipulator.expand_attributes_to_binary_values(
            final_data, col_idx, num_bins)
    return final_data
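
Both variants of process_house_votes_data lean on DataManipulator.expand_attributes_to_binary_values, which this page does not show. A minimal sketch of what such a one-hot expansion could look like, inferred from the call sites; the integer bin ids, the in-place replacement, and the num_bins-column stride are assumptions:

# Hedged sketch: one-hot expansion of a single binned column. Assumes the
# column at col_idx holds integer bin ids in range(num_bins); the bin column
# is replaced by num_bins 0/1 columns, which is consistent with the examples
# advancing col_idx by num_bins per iteration.
def expand_attributes_to_binary_values(data, col_idx, num_bins):
    expanded = []
    for row in data:
        one_hot = [1 if row[col_idx] == b else 0 for b in range(num_bins)]
        expanded.append(row[:col_idx] + one_hot + row[col_idx + 1:])
    return expanded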
Example #2
def process_house_votes_data(file_path):
    #print('LOG: process_house_votes_data() START')

    #NOTE: despite the name, this variant bins the four iris attributes
    original_data = FileManager.get_csv_file_data_array(file_path)

    #Group input values into bins over each attribute's fixed range
    num_bins = 4
    #Sepal length: 4.3 - 7.9
    data_in_bins = _bin_input_attribute(original_data, 0, 4.3, 7.9, num_bins)
    #Sepal width: 2.0 - 4.4
    data_in_bins = _bin_input_attribute(data_in_bins, 1, 2.0, 4.4, num_bins)
    #Petal length: 1.0 - 6.9
    data_in_bins = _bin_input_attribute(data_in_bins, 2, 1.0, 6.9, num_bins)
    #Petal width: 0.1 - 2.5
    data_in_bins = _bin_input_attribute(data_in_bins, 3, 0.1, 2.5, num_bins)

    #Turn input features into binary values (0,1)
    col_idx = 0
    final_data = DataManipulator.expand_attributes_to_binary_values(
        data_in_bins, col_idx, num_bins)
    for col in data_in_bins[0]:
        col_idx += num_bins  #each attribute expands into num_bins columns per iteration
        if col_idx == (len(final_data[0]) - 1):  #skip the last column: it holds the class value
            break
        final_data = DataManipulator.expand_attributes_to_binary_values(
            final_data, col_idx, num_bins)
    return final_data
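
Example #2 also depends on a _bin_input_attribute helper that is not shown. A plausible sketch, assuming equal-width bins between min_val and max_val with out-of-range values clamped to the first and last bin:

# Hedged sketch: equal-width binning of one column into bin ids 0..num_bins-1.
# The equal-width assumption and the clamping are inferences from the fixed
# per-attribute ranges passed at the call sites above.
def _bin_input_attribute(data, column, min_val, max_val, num_bins):
    bin_width = (max_val - min_val) / num_bins
    binned = []
    for row in data:
        bin_id = int((row[column] - min_val) / bin_width)
        bin_id = max(0, min(bin_id, num_bins - 1))  # clamp edge values
        new_row = list(row)
        new_row[column] = bin_id
        binned.append(new_row)
    return binned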
Example #3
def process_house_votes_data(file_path):
    #print('LOG: process_house_votes_data() START')
    #print(file_path)

    original_data = FileManager.get_csv_file_data_array(file_path)
    #print('LOG: ORIGINAL data')
    #print(original_data[0])

    #Drop the ID column, convert the class labels {2,4} -> {0,1}, and shift bins from 1-10 to 0-9
    modified_data = list()
    for vector in original_data:
        tmplist = vector[1:]
        number_of_cols = len(tmplist)
        for col_idx in range(number_of_cols):
            if col_idx == (number_of_cols - 1):
                #Classification value, {2,4} -> {0,1}
                if tmplist[col_idx] == 2:
                    tmplist[col_idx] = 0
                elif tmplist[col_idx] == 4:
                    tmplist[col_idx] = 1
                else:
                    print('ERROR: Expected {2,4} but instead got ',
                          tmplist[col_idx])
                    #no modification made
            else:  #normal bin value
                tmplist[col_idx] = tmplist[col_idx] - 1  #bins 1-10 -> 0-9

        modified_data.append(tmplist)

    #Turn input features into binary values (0,1)
    num_bins = 10
    col_idx = 0
    final_data = DataManipulator.expand_attributes_to_binary_values(
        modified_data, col_idx, num_bins)
    for col in modified_data[0]:
        col_idx += num_bins  #each attribute expands into num_bins columns per iteration
        if col_idx == (len(final_data[0]) - 1):  #skip the last column: it holds the class value
            break
        final_data = DataManipulator.expand_attributes_to_binary_values(
            final_data, col_idx, num_bins)
    return final_data
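
The preprocessing loop in Example #3 (drop the leading ID column, remap class labels {2,4} to {0,1}, shift bins 1-10 down to 0-9) can be expressed more compactly. A sketch of an equivalent version; preprocess_rows and CLASS_MAP are hypothetical names, and unexpected labels pass through unchanged rather than printing an error:

# Hedged sketch: the same row preprocessing written with comprehensions.
CLASS_MAP = {2: 0, 4: 1}

def preprocess_rows(original_data):
    modified = []
    for row in original_data:
        features = [v - 1 for v in row[1:-1]]    # bins 1-10 -> 0-9
        label = CLASS_MAP.get(row[-1], row[-1])  # {2,4} -> {0,1}, else unchanged
        modified.append(features + [label])
    return modified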
Example #4
def run_model_with_cross_validation(prune=False):

    #GET DATA
    #- expect data_0 ... data_4
    data_groups = [FileManager.get_csv_file_data_array('data_' + str(i))
                   for i in range(5)]

    if prune:
        validation_data = FileManager.get_csv_file_data_array(
            'validation_data')

    NUM_GROUPS = len(data_groups)

    #For each data group, train on the remaining groups and test on the held-out group
    culminating_result = 0
    culminating_validation_result = 0

    final_average_result = 0
    final_validation_average_result = 0

    for test_group_id in range(NUM_GROUPS):
        print()
        #Form training data as 4/5 data
        train_data = list()
        for train_group_id in range(len(data_groups)):
            if (train_group_id != test_group_id):
                #Initialize train_data if necessary
                if (len(train_data) == 0):
                    train_data = data_groups[train_group_id]
                else:
                    train_data = train_data + data_groups[train_group_id]

        print('train_data group', str(test_group_id), 'length: ',
              len(train_data))
        #print(train_data)

        test_data = data_groups[test_group_id]

        result = 0
        validation_result = 0
        model = ClassificationTree(train_data)
        model.train()
        print('tree size:', model.get_size_of_tree())
        print('tree: ')
        print(model.print_tree())
        result = model.test(test_data)
        #print('result:', result)
        culminating_result = culminating_result + result
        print('Accuracy (%):', result)
        if prune:
            model.validate(validation_data)
            print('tree size w/ pruning:', model.get_size_of_tree())
            print('tree: ')
            print(model.print_tree())
            validation_result = model.test(test_data)
            #print('result:', result)
            culminating_validation_result = culminating_validation_result + validation_result
            print('Accuracy w/ pruning (%):', validation_result)
        print()

    final_average_result = culminating_result / NUM_GROUPS
    final_validation_average_result = culminating_validation_result / NUM_GROUPS
    #print()
    #print('final average result:')
    #print(final_average_result)
    #print()

    return (final_average_result, final_validation_average_result)
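
The initialize-or-concatenate bookkeeping used to assemble the training folds above (and repeated in the other cross-validation drivers below) can be collapsed into a comprehension. A sketch; make_fold is a hypothetical helper name:

# Hedged sketch: build one 4/5-vs-1/5 split without the manual
# initialize-or-append logic.
def make_fold(data_groups, test_group_id):
    train_data = [row
                  for gid, group in enumerate(data_groups)
                  if gid != test_group_id
                  for row in group]
    test_data = data_groups[test_group_id]
    return train_data, test_data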
def main():
    #print('LOG: Main program to run tests')

    parser = argparse.ArgumentParser(
        description='Learn & Verify machine learning algorithms')
    parser.add_argument('all_class_filepath',
                        type=str,
                        help='full path to the all-classes input file')
    parser.add_argument('class_0_filepath',
                        type=str,
                        help='full path to the class 0 input file')
    parser.add_argument('class_1_filepath',
                        type=str,
                        help='full path to the class 1 input file')
    parser.add_argument('class_2_filepath',
                        type=str,
                        help='full path to the class 2 input file')
    parser.add_argument('fraction',
                        type=float,
                        default=0.66,
                        nargs='?',
                        help='fraction of data to learn')
    parser.add_argument('num_classes',
                        type=int,
                        default=3,
                        nargs='?',
                        help='number of total classes')
    args = parser.parse_args()

    #INPUTS
    print()
    print('INPUTS')
    allclass_filepath = args.all_class_filepath
    print('allclass_filepath: ' + allclass_filepath)
    class0_filepath = args.class_0_filepath
    print('class0_filepath: ' + class0_filepath)
    class1_filepath = args.class_1_filepath
    print('class1_filepath: ' + class1_filepath)
    class2_filepath = args.class_2_filepath
    print('class2_filepath: ' + class2_filepath)
    fraction_of_data_for_learning = args.fraction
    print('fraction of data to train: ', fraction_of_data_for_learning)
    number_of_classes = args.num_classes
    print('number of classes: ', number_of_classes)

    #READ INPUT DATA
    allclass_data = FileManager.get_csv_file_data_array(allclass_filepath)
    print('number of input vectors: ', len(allclass_data))
    class0_data = FileManager.get_csv_file_data_array(class0_filepath)
    class1_data = FileManager.get_csv_file_data_array(class1_filepath)
    class2_data = FileManager.get_csv_file_data_array(class2_filepath)

    #SPLIT INPUT DATA (learning & test sets)
    print()
    allclass_data_sets = DataManipulator.split_data_in_2_randomly(
        allclass_data, fraction_of_data_for_learning)
    class0_data_sets = DataManipulator.split_data_in_2_randomly(
        class0_data, fraction_of_data_for_learning)
    class1_data_sets = DataManipulator.split_data_in_2_randomly(
        class1_data, fraction_of_data_for_learning)
    class2_data_sets = DataManipulator.split_data_in_2_randomly(
        class2_data, fraction_of_data_for_learning)

    allclass_learning_data = allclass_data_sets[0]
    print('learning data size: ', len(allclass_learning_data))
    allclass_test_data = allclass_data_sets[1]
    print('test data size: ', len(allclass_test_data))
    print()

    class0_learning_data = class0_data_sets[0]
    class0_test_data = class0_data_sets[1]
    class1_learning_data = class1_data_sets[0]
    class1_test_data = class1_data_sets[1]
    class2_learning_data = class2_data_sets[0]
    class2_test_data = class2_data_sets[1]

    #LEARN THE MODELS

    #Winnow2
    winnow2_0 = Winnow2()
    winnow2_1 = Winnow2()
    winnow2_2 = Winnow2()

    winnow2_0_learned_weights = winnow2_0.learn_winnow2_model(
        class0_learning_data)
    winnow2_1_learned_weights = winnow2_1.learn_winnow2_model(
        class1_learning_data)
    winnow2_2_learned_weights = winnow2_2.learn_winnow2_model(
        class2_learning_data)
    print('Winnow2 learned weights for class 0:')
    print(winnow2_0_learned_weights)
    print()
    print('Winnow2 learned weights for class 1:')
    print(winnow2_1_learned_weights)
    print()
    print('Winnow2 learned weights for class 2:')
    print(winnow2_2_learned_weights)
    print()

    #Naive Bayes
    naive_bayes = NaiveBayes(number_of_classes)
    naive_bayes_learned_percents = naive_bayes.learn_naive_bayes_model(
        allclass_learning_data)
    print(
        'Naive Bayes learned percentages as input[ class[ (prob0, prob1) ] ]')
    print(naive_bayes_learned_percents)
    print()

    #TEST THE MODELS

    #Winnow2
    print('Testing Winnow2 model')
    winnow2_multi_model_test_results = Winnow2.test_multiple_winnow2_models(
        allclass_test_data, [winnow2_0, winnow2_1, winnow2_2])
    print('classification attempts(', winnow2_multi_model_test_results[0],
          '), #fails(', winnow2_multi_model_test_results[1],
          '), #success(', winnow2_multi_model_test_results[2], ')')
    print()

    #Naive Bayes
    print('Testing Naive Bayes model')
    naive_bayes_test_results = naive_bayes.test_naive_bayes_model(
        allclass_test_data)
    print('#classification attempts(', naive_bayes_test_results[0],
          '), #fails(', naive_bayes_test_results[1],
          '), #success(', naive_bayes_test_results[2], ')')
    print()
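
Neither main() shows the Winnow2 class itself. For reference, a minimal sketch of the classic Winnow2 update rule: multiplicative promotion and demotion of the weights on active features. The alpha and theta defaults are assumptions, as is the class label sitting in the last column:

# Hedged sketch of the classic Winnow2 update rule (not the Winnow2 class used
# above). Binary 0/1 inputs; weights start at 1 and are scaled multiplicatively.
def learn_winnow2(rows, alpha=2.0, theta=0.5):
    num_features = len(rows[0]) - 1
    weights = [1.0] * num_features
    for row in rows:
        x, y = row[:-1], row[-1]
        predicted = 1 if sum(w * xi for w, xi in zip(weights, x)) > theta else 0
        if predicted == 0 and y == 1:    # promotion: scale up active-feature weights
            weights = [w * alpha if xi == 1 else w for w, xi in zip(weights, x)]
        elif predicted == 1 and y == 0:  # demotion: scale down active-feature weights
            weights = [w / alpha if xi == 1 else w for w, xi in zip(weights, x)]
    return weights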
def main():
    #print('LOG: Main program to run tests')

    parser = argparse.ArgumentParser(
        description='Learn & Verify machine learning algorithms')
    parser.add_argument('file_path', type=str, help='full path to input file')
    parser.add_argument('fraction',
                        type=float,
                        default=0.66,
                        nargs='?',
                        help='fraction of data to learn')
    parser.add_argument('num_classes',
                        type=int,
                        default=2,
                        nargs='?',
                        help='number of total classes')
    args = parser.parse_args()

    #INPUTS
    print()
    print('INPUTS')
    file_path = args.file_path
    print('filepath: ' + file_path)
    fraction_of_data_for_learning = args.fraction
    print('fraction of data to train: ', fraction_of_data_for_learning)
    number_of_classes = args.num_classes
    print('number of classes: ', number_of_classes)

    #READ INPUT DATA
    input_data = FileManager.get_csv_file_data_array(file_path)
    print('number of input vectors: ', len(input_data))

    #SPLIT INPUT DATA (learning & test sets)
    print()
    data_sets = DataManipulator.split_data_in_2_randomly(
        input_data, fraction_of_data_for_learning)
    learning_data = data_sets[0]
    print('learning data size: ', len(learning_data))
    test_data = data_sets[1]
    print('test data size: ', len(test_data))
    print()

    #LEARN THE MODELS

    #Winnow2
    winnow2 = Winnow2()  # default values for alpha, threshold, & start weight
    winnow2_learned_weights = winnow2.learn_winnow2_model(learning_data)
    print('Winnow2 learned weights:')
    print(winnow2_learned_weights)
    print()

    #Naive Bayes
    naive_bayes = NaiveBayes(number_of_classes)
    naive_bayes_learned_percents = naive_bayes.learn_naive_bayes_model(
        learning_data)
    print(
        'Naive Bayes learned percentages as input[ class[ (prob0, prob1) ] ]')
    print(naive_bayes_learned_percents)
    print()

    #TEST THE MODELS

    #Winnow2
    print('Testing Winnow2 model')
    winnow2_test_results = winnow2.test_winnow2_model(test_data)
    print('classification attempts(', winnow2_test_results[0],
          '), #fails(', winnow2_test_results[1],
          '), #success(', winnow2_test_results[2], ')')
    print()

    #Naive Bayes
    print('Testing Naive Bayes model')
    naive_bayes_test_results = naive_bayes.test_naive_bayes_model(test_data)
    print('#classification attempts(', naive_bayes_test_results[0],
          '), #fails(', naive_bayes_test_results[1],
          '), #success(', naive_bayes_test_results[2], ')')
    print()
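
The NaiveBayes class is likewise not shown; the printout above describes its learned state as input[ class[ (prob0, prob1) ] ]. A sketch of how those per-feature conditional probabilities could be estimated for binary features; the add-one (Laplace) smoothing is an assumption:

# Hedged sketch: estimate P(feature=0|class) and P(feature=1|class) for binary
# features, laid out as percents[feature][class] = (prob0, prob1) to match the
# printout above. Assumes the class label is in the last column.
def learn_naive_bayes_percents(rows, num_classes):
    num_features = len(rows[0]) - 1
    percents = []
    for f in range(num_features):
        per_class = []
        for c in range(num_classes):
            class_rows = [r for r in rows if r[-1] == c]
            ones = sum(r[f] for r in class_rows)
            prob1 = (ones + 1) / (len(class_rows) + 2)  # add-one smoothing
            per_class.append((1 - prob1, prob1))
        percents.append(per_class)
    return percents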
def run_models_with_cross_validation(number_of_layers, nodes_per_layer,
                                     learning_rate, max_epoch, error_thresh):

    #GET DATA
    #- expect data_0 ... data_4
    data_type = 'int'
    data_groups = [
        FileManager.get_csv_file_data_array('data_' + str(i), data_type)
        for i in range(5)
    ]

    model1_culminating_result = 0
    model1_final_average_result = 0

    NUM_GROUPS = len(data_groups)
    #For each data group, train on the remaining groups and test on the held-out group
    for test_group_id in range(NUM_GROUPS):
        print()
        #Form training data as 4/5 data
        train_data = list()
        for train_group_id in range(len(data_groups)):
            if (train_group_id != test_group_id):
                #Initialize train_data if necessary
                if (len(train_data) == 0):
                    train_data = data_groups[train_group_id]
                else:
                    train_data = train_data + data_groups[train_group_id]

        print('train_data group', str(test_group_id), 'length: ',
              len(train_data))
        #print(train_data)

        test_data = data_groups[test_group_id]
        test_data = pd.DataFrame(test_data)

        train_data = pd.DataFrame(train_data)
        print(train_data.head())
        model1 = NeuralNetworkFFBP(train_data, number_of_layers,
                                   nodes_per_layer)
        model1.train(learning_rate, max_epoch, error_thresh)

        print_classifications = False
        if test_group_id == 0:  #print classifications for one fold only
            print_classifications = True
        model1_result = model1.test(
            test_data,
            print_classifications)  # returns (attempts, fails, success)
        model1_accuracy = (model1_result[2] / model1_result[0]) * 100
        print('Accuracy:', model1_accuracy, '%')
        model1_culminating_result = model1_culminating_result + model1_accuracy

    model1_final_average_result = model1_culminating_result / NUM_GROUPS
    #print()
    #print('final average result:')
    #print(final_average_result)
    #print()

    return model1_final_average_result
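
All of the cross-validation drivers assume FileManager.get_csv_file_data_array(path, data_type). A minimal sketch of such a loader; the data_type handling and the float default are assumptions inferred from the calls on this page:

# Hedged sketch of the assumed CSV loader: returns a list of rows, converting
# each field according to data_type ('int' inferred from the calls above).
import csv

def get_csv_file_data_array(file_path, data_type='float'):
    convert = int if data_type == 'int' else float
    with open(file_path, newline='') as f:
        return [[convert(value) for value in row] for row in csv.reader(f)]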
def run_models_with_cross_validation(num_classes=2, learning_rate=0.5):

    #GET DATA
    #- expect data_0 ... data_4
    data_type = 'int'
    data_groups = [
        FileManager.get_csv_file_data_array('data_' + str(i), data_type)
        for i in range(5)
    ]

    NUM_GROUPS = len(data_groups)

    #For each data group, train on the remaining groups and test on the held-out group
    model1_culminating_result = 0
    model2_culminating_result = 0
    model1_final_average_result = 0
    model2_final_average_result = 0

    for test_group_id in range(NUM_GROUPS):
        print()
        #Form training data as 4/5 data
        train_data = list()
        for train_group_id in range(len(data_groups)):
            if (train_group_id != test_group_id):
                #Initialize train_data if necessary
                if (len(train_data) == 0):
                    train_data = data_groups[train_group_id]
                else:
                    train_data = train_data + data_groups[train_group_id]

        print('train_data group', str(test_group_id), 'length: ',
              len(train_data))
        #print(train_data)

        test_data = data_groups[test_group_id]

        model1_result = 0
        model2_result = 0
        model1 = NaiveBayes(num_classes)
        model2 = LogisticRegression(pd.DataFrame(train_data))
        model1.train(train_data)
        model2.train(pd.DataFrame(train_data), learning_rate)
        print_classifications = False
        if test_group_id == 0:  #print classifications for one fold only
            print_classifications = True
        model1_result = model1.test(
            test_data,
            print_classifications)  # returns (attempts, fails, success)
        model1_accuracy = (model1_result[2] / model1_result[0]) * 100
        print('Naive Bayes Accuracy (%):', model1_accuracy)
        model2_result = model2.test(
            pd.DataFrame(test_data),
            print_classifications)  # returns (% accuracy)
        print('Logistic Regression Accuracy (%):', model2_result)
        model1_culminating_result = model1_culminating_result + model1_accuracy
        model2_culminating_result = model2_culminating_result + model2_result

    model1_final_average_result = model1_culminating_result / NUM_GROUPS
    model2_final_average_result = model2_culminating_result / NUM_GROUPS
    #print()
    #print('final average result:')
    #print(final_average_result)
    #print()

    return (model1_final_average_result, model2_final_average_result)
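
Finally, the LogisticRegression model trained above takes a learning rate, which suggests gradient descent on the log loss. A sketch of one pass of stochastic gradient descent, not the class's actual implementation; the layout of features in row[:-1] and the label in row[-1] is assumed to match the other examples:

# Hedged sketch: one epoch of stochastic gradient descent for logistic
# regression on 0/1 labels.
import math

def sigmoid(z):
    return 1.0 / (1.0 + math.exp(-z))

def gradient_epoch(rows, weights, bias, learning_rate=0.5):
    for row in rows:
        x, y = row[:-1], row[-1]
        prediction = sigmoid(sum(w * xi for w, xi in zip(weights, x)) + bias)
        error = prediction - y
        weights = [w - learning_rate * error * xi for w, xi in zip(weights, x)]
        bias -= learning_rate * error
    return weights, bias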