# Ejemplo n.º 1
# 0
def process_house_votes_data(file_path):
	"""One-hot encode the house-votes data set read from *file_path*.

	Loads the CSV, moves the class label (column 0) to the last column,
	bins every input attribute, then expands each binned attribute into
	``num_bins`` binary (0/1) indicator columns.  Returns the encoded rows.
	"""
	raw_rows = FileManager.get_csv_file_data_array(file_path)

	# The class label lives in column 0; relocate it to the end so every
	# remaining column is an input attribute.
	class_column = 0
	reordered_rows = DataManipulator.move_column_to_end(raw_rows, class_column)

	# Group raw attribute values into discrete bins.
	binned_rows = _bin_input_attributes(reordered_rows)

	# Expand each binned attribute into `num_bins` indicator columns.
	num_bins = 3
	cursor = 0
	encoded_rows = DataManipulator.expand_attributes_to_binary_values(binned_rows, cursor, num_bins)
	for _ in binned_rows[0]:
		# Each expansion inserts `num_bins` columns, so advance past them.
		cursor += num_bins
		if cursor == len(encoded_rows[0]) - 1:
			# Reached the final column, which holds the class label; stop.
			break
		encoded_rows = DataManipulator.expand_attributes_to_binary_values(encoded_rows, cursor, num_bins)
	return encoded_rows
def process_house_votes_data(file_path):
    """One-hot encode the iris-style data set read from *file_path*.

    Bins the four numeric attributes (sepal length/width, petal
    length/width) into ``num_bins`` ranges using their known min/max,
    then expands each binned attribute into binary indicator columns.
    Returns the encoded rows.
    """
    original_data = FileManager.get_csv_file_data_array(file_path)

    num_bins = 4
    # (column index, observed min, observed max) for each iris attribute.
    attribute_ranges = (
        (0, 4.3, 7.9),  # sepal length
        (1, 2.0, 4.4),  # sepal width
        (2, 1.0, 6.9),  # petal length
        (3, 0.1, 2.5),  # petal width
    )

    # BUG FIX: the original passed `original_data` to every
    # `_bin_input_attribute` call, so the binning of columns 0-2 was
    # discarded unless the helper mutated its input in place.  Chaining
    # each call on the previous result is correct in either case.
    data_in_bins = original_data
    for column, min_val, max_val in attribute_ranges:
        data_in_bins = _bin_input_attribute(data_in_bins, column, min_val,
                                            max_val, num_bins)

    # Expand each binned attribute into `num_bins` 0/1 indicator columns.
    col_idx = 0
    final_data = DataManipulator.expand_attributes_to_binary_values(
        data_in_bins, col_idx, num_bins)
    for col in data_in_bins[0]:
        col_idx += num_bins  # columns are inserted on each iteration
        if col_idx == (len(final_data[0]) - 1):
            # The last column is the classification value; skip it.
            break
        final_data = DataManipulator.expand_attributes_to_binary_values(
            final_data, col_idx, num_bins)

    return final_data
# Ejemplo n.º 3
# 0
def process_house_votes_data(file_path):
    """One-hot encode the breast-cancer-style data set at *file_path*.

    Drops the leading id column, remaps the class label from {2, 4} to
    {0, 1}, rebases the 1-10 attribute bins to 0-9, then expands every
    binned attribute into 10 binary indicator columns.
    """
    original_data = FileManager.get_csv_file_data_array(file_path)

    # Pre-process each row: drop column 0, remap the class label, and
    # rebase the attribute bins to start at 0.
    modified_data = list()
    for source_row in original_data:
        row = source_row[1:]
        class_idx = len(row) - 1
        for idx in range(len(row)):
            if idx == class_idx:
                # Classification value: {2, 4} -> {0, 1}.
                if row[idx] == 2:
                    row[idx] = 0
                elif row[idx] == 4:
                    row[idx] = 1
                else:
                    # Unexpected label: report it and leave it unchanged.
                    print('ERROR: Expected {2,4} but instead got ',
                          row[idx])
            else:
                # Attribute bins arrive as 1-10; shift to 0-9.
                row[idx] = row[idx] - 1

        modified_data.append(row)

    # Expand each binned attribute into `num_bins` 0/1 indicator columns.
    num_bins = 10
    cursor = 0
    final_data = DataManipulator.expand_attributes_to_binary_values(
        modified_data, cursor, num_bins)
    for _ in modified_data[0]:
        cursor += num_bins  # each expansion inserts `num_bins` columns
        if cursor == len(final_data[0]) - 1:
            # The final column is the class label; do not expand it.
            break
        final_data = DataManipulator.expand_attributes_to_binary_values(
            final_data, cursor, num_bins)
    return final_data
def main():
    """CLI entry point: read a CSV, one-hot encode it, append to output.

    Takes three positional arguments: input path, output path, and the
    maximum number of bins for the encoded output.
    """
    arg_parser = argparse.ArgumentParser(description='One-hot-code data')
    arg_parser.add_argument('input_path', type=str,
                            help='full path to input file')
    arg_parser.add_argument('output_path', type=str,
                            help='full path to output file')
    arg_parser.add_argument('number_max_bins', type=int,
                            help='number of max bins for output data')
    args = arg_parser.parse_args()

    data_frame = FileManager.get_csv_file_data_pandas(args.input_path)
    print('data frame head:')
    print(data_frame.head(3))

    hot_coded_data_frame = DataManipulator.one_hot_code(data_frame,
                                                        args.number_max_bins)
    print('data one hot coded head:')
    print(hot_coded_data_frame.head(3))

    # mode='a' appends, so repeated runs accumulate rows in the output file.
    hot_coded_data_frame.to_csv(args.output_path,
                                header=None,
                                index=None,
                                sep=',',
                                mode='a')
# Ejemplo n.º 5
# 0
def main():
    """Refresh currency data via the scrapy spider and print trade options.

    Deletes the previous data file, re-scrapes it, then hands off to
    DataManipulator() which prints the suggestions.
    """
    print("Crypto Currency Trade Suggestion Bot")
    print("*Starting Pulling Data*")
    # Remove stale data first.  Tolerate a missing file (e.g. the very
    # first run) instead of crashing with FileNotFoundError.
    try:
        os.remove('currency_data.txt')
    except FileNotFoundError:
        pass
    os.system('scrapy runspider currency_attribute_getter.py')
    print("Data has been Updated")
    print('Best Trade Options')
    DataManipulator()
def split_data(file_path, is_random, num_groups=5, separator=','):
    """Read a CSV file and split its rows into ``num_groups`` groups.

    Args:
        file_path: full path to the CSV input file.
        is_random: truthy for a plain random split; falsy for a split
            that accounts for class distribution across groups.
        num_groups: number of groups to produce (default 5).
        separator: CSV field separator (default ',').

    Returns:
        A list of data groups as produced by DataManipulator.
    """
    original_data = FileManager.get_csv_file_data_pandas(file_path, separator)

    # Guard against a string value arriving from a CLI layer.
    num_groups = int(num_groups)

    # Idiom fix: test truthiness directly instead of comparing `== True`,
    # which silently sent any non-bool truthy value down the else branch.
    if is_random:
        # Basic random num_groups-way split.
        groups = DataManipulator.split_data_randomly(original_data, num_groups)
    else:
        # More complex split that preserves class proportions.
        groups = DataManipulator.split_data_randomly_accounting_for_class(
            original_data, num_groups)

    return groups
# Ejemplo n.º 7
# 0
def main():
    """CLI entry point: move a given CSV column to be the last column.

    Reads the input CSV as a NumPy array of floats, moves the requested
    column to the end, and writes the result to the output path.
    """
    parser = argparse.ArgumentParser(
        description='Move the given column to be the last column')
    parser.add_argument('file_path_in',
                        type=str,
                        help='full path to input file')
    parser.add_argument('column', type=int, help='the column to remove')
    parser.add_argument('file_path_out',
                        type=str,
                        help='full path to output file')
    args = parser.parse_args()
    column = args.column

    data = FileManager.get_csv_file_data_numpy(args.file_path_in, ',')
    # BUG FIX: `np.float` was deprecated in NumPy 1.20 and removed in
    # 1.24; the builtin `float` is the documented drop-in replacement
    # (same float64 dtype).
    data = data.astype(float)

    data_as_np = DataManipulator.move_np_column_to_end(data, column)
    np.savetxt(args.file_path_out, data_as_np, delimiter=',')
    # NOTE: a stray "'''" from the original scrape was commented out here
    # so the file remains parseable.
def main():
    """Learn and test Winnow2 and Naive Bayes models on a 3-class data set.

    Expects four pre-processed CSV files: one containing all classes and
    one one-vs-rest file per class (0, 1, 2).  Each file is split into a
    learning set and a test set; one Winnow2 model is trained per class
    and a single multi-class Naive Bayes model is trained on all classes;
    test results for both are printed to stdout.
    """
    #print('LOG: Main program to run tests')

    parser = argparse.ArgumentParser(
        description='Learn & Verify machine learning algorithms')
    parser.add_argument('all_class_filepath',
                        type=str,
                        help='full path to input file')
    parser.add_argument('class_0_filepath',
                        type=str,
                        help='full path to input file')
    parser.add_argument('class_1_filepath',
                        type=str,
                        help='full path to input file')
    parser.add_argument('class_2_filepath',
                        type=str,
                        help='full path to input file')
    parser.add_argument('fraction',
                        type=float,
                        default=0.66,
                        nargs='?',
                        help='fraction of data to learn')
    parser.add_argument('num_classes',
                        type=int,
                        default=3,
                        nargs='?',
                        help='number of total classes')
    args = parser.parse_args()

    #INPUTS
    print()
    print('INPUTS')
    allclass_filepath = args.all_class_filepath
    print('overall_model_file_path: ' + allclass_filepath)
    class0_filepath = args.class_0_filepath
    print('class0_filepath: ' + class0_filepath)
    class1_filepath = args.class_1_filepath
    print('class1_filepath: ' + class1_filepath)
    class2_filepath = args.class_2_filepath
    print('class2_filepath: ' + class2_filepath)
    fraction_of_data_for_learning = args.fraction
    print('fraction of data to train: ', fraction_of_data_for_learning)
    number_of_classes = args.num_classes
    print('number of classes: ', number_of_classes)

    #READ INPUT DATA
    allclass_data = FileManager.get_csv_file_data_array(allclass_filepath)
    print('number of input vectors: ', len(allclass_data))
    class0_data = FileManager.get_csv_file_data_array(class0_filepath)
    class1_data = FileManager.get_csv_file_data_array(class1_filepath)
    class2_data = FileManager.get_csv_file_data_array(class2_filepath)

    #SPLIT INPUT DATA (learning & test sets)
    # Each split returns [learning_set, test_set] per the indexing below.
    print()
    allclass_data_sets = DataManipulator.split_data_in_2_randomly(
        allclass_data, fraction_of_data_for_learning)
    class0_data_sets = DataManipulator.split_data_in_2_randomly(
        class0_data, fraction_of_data_for_learning)
    class1_data_sets = DataManipulator.split_data_in_2_randomly(
        class1_data, fraction_of_data_for_learning)
    class2_data_sets = DataManipulator.split_data_in_2_randomly(
        class2_data, fraction_of_data_for_learning)

    allclass_learning_data = allclass_data_sets[0]
    print('learning data size: ', len(allclass_learning_data))
    allclass_test_data = allclass_data_sets[1]
    print('test data size: ', len(allclass_test_data))
    print()

    class0_learning_data = class0_data_sets[0]
    class0_test_data = class0_data_sets[1]
    class1_learning_data = class1_data_sets[0]
    class1_test_data = class1_data_sets[1]
    class2_learning_data = class2_data_sets[0]
    class2_test_data = class2_data_sets[1]

    #LEARN THE MODELS

    #Winnow2 — one one-vs-rest model per class
    winnow2_0 = Winnow2()
    winnow2_1 = Winnow2()
    winnow2_2 = Winnow2()

    winnow2_0_learned_weights = winnow2_0.learn_winnow2_model(
        class0_learning_data)
    winnow2_1_learned_weights = winnow2_1.learn_winnow2_model(
        class1_learning_data)
    winnow2_2_learned_weights = winnow2_2.learn_winnow2_model(
        class2_learning_data)
    print('Winnow2 learned weights for class 0:')
    print(winnow2_0_learned_weights)
    print()
    print('Winnow2 learned weights for class 1:')
    print(winnow2_1_learned_weights)
    print()
    print('Winnow2 learned weights for class 2:')
    print(winnow2_2_learned_weights)
    print()

    #Naive Bayes — a single multi-class model over all classes
    naive_bayes = NaiveBayes(number_of_classes)
    naive_bayes_learned_percents = naive_bayes.learn_naive_bayes_model(
        allclass_learning_data)
    print(
        'Naive Bayes learned percentages as input[ class[ (prob0, prob1) ] ]')
    print(naive_bayes_learned_percents)
    print()

    #TEST THE MODELS

    #Winnow2
    # NOTE: the backslash continuations below keep the '#fails(' and
    # '#success(' text inside the string literals — it is output text,
    # not a comment.
    print('Testing Winnow2 model')
    winnow2_multi_model_test_results = Winnow2.test_multiple_winnow2_models(
        allclass_test_data, [winnow2_0, winnow2_1, winnow2_2])
    print('classification attempts(', winnow2_multi_model_test_results[0],
          '), \
#fails(', winnow2_multi_model_test_results[1], '), \
#success(', winnow2_multi_model_test_results[2], ')')
    print()

    #Naive Bayes
    print('Testing Naive Bayes model')
    naive_bayes_test_results = naive_bayes.test_naive_bayes_model(
        allclass_test_data)
    print('#classification attempts(', naive_bayes_test_results[0], '), \
#fails(', naive_bayes_test_results[1], '), \
#success(', naive_bayes_test_results[2], ')')
    print()
def main():
    """Learn and test Winnow2 and Naive Bayes models on a 2-class data set.

    Reads one pre-processed CSV file, splits it into a learning set and a
    test set, trains a Winnow2 model and a Naive Bayes model on the
    learning set, then prints test results for both models to stdout.
    """
    #print('LOG: Main program to run tests')

    parser = argparse.ArgumentParser(
        description='Learn & Verify machine learning algorithms')
    parser.add_argument('file_path', type=str, help='full path to input file')
    parser.add_argument('fraction',
                        type=float,
                        default=0.66,
                        nargs='?',
                        help='fraction of data to learn')
    parser.add_argument('num_classes',
                        type=int,
                        default=2,
                        nargs='?',
                        help='number of total classes')
    args = parser.parse_args()

    #INPUTS
    print()
    print('INPUTS')
    file_path = args.file_path
    print('filepath: ' + file_path)
    fraction_of_data_for_learning = args.fraction
    print('fraction of data to train: ', fraction_of_data_for_learning)
    number_of_classes = args.num_classes
    print('number of classes: ', number_of_classes)

    #READ INPUT DATA
    input_data = FileManager.get_csv_file_data_array(file_path)
    print('number of input vectors: ', len(input_data))

    #SPLIT INPUT DATA (learning & test sets)
    # The split returns [learning_set, test_set] per the indexing below.
    print()
    data_sets = DataManipulator.split_data_in_2_randomly(
        input_data, fraction_of_data_for_learning)
    learning_data = data_sets[0]
    print('learning data size: ', len(learning_data))
    test_data = data_sets[1]
    print('test data size: ', len(test_data))
    print()

    #LEARN THE MODELS

    #Winnow2
    winnow2 = Winnow2()  # default values for alpha, threshold, & start weight
    winnow2_learned_weights = winnow2.learn_winnow2_model(learning_data)
    print('Winnow2 learned weights:')
    print(winnow2_learned_weights)
    print()

    #Naive Bayes
    naive_bayes = NaiveBayes(number_of_classes)
    naive_bayes_learned_percents = naive_bayes.learn_naive_bayes_model(
        learning_data)
    print(
        'Naive Bayes learned percentages as input[ class[ (prob0, prob1) ] ]')
    print(naive_bayes_learned_percents)
    print()

    #TEST THE MODELS

    #Winnow2
    # NOTE: the backslash continuations below keep the '#fails(' and
    # '#success(' text inside the string literals — it is output text,
    # not a comment.
    print('Testing Winnow2 model')
    winnow2_test_results = winnow2.test_winnow2_model(test_data)
    print('classification attempts(', winnow2_test_results[0], '), \
#fails(', winnow2_test_results[1], '), \
#success(', winnow2_test_results[2], ')')
    print()

    #Naive Bayes
    print('Testing Naive Bayes model')
    naive_bayes_test_results = naive_bayes.test_naive_bayes_model(test_data)
    print('#classification attempts(', naive_bayes_test_results[0], '), \
#fails(', naive_bayes_test_results[1], '), \
#success(', naive_bayes_test_results[2], ')')
    print()