def test_naive_bayes_classifier_predict():
    train = [[1, 5], [2, 6], [1, 5], [1, 5], [1, 6], [2, 6], [1, 5], [1, 6]]
    y = ["yes", "yes", "no", "no", "yes", "no", "yes", "yes"]
    nb = MyNaiveBayesClassifier()
    nb.fit(train, y)
    pred = nb.predict([[1, 5]])
    assert pred == ["yes"]

    # RQ5 (fake) iPhone purchases dataset
    iphone_col_names = ["standing", "job_status", "credit_rating", "buys_iphone"]
    iphone_table = [[1, 3, "fair", "no"],
                    [1, 3, "excellent", "no"],
                    [2, 3, "fair", "yes"],
                    [2, 2, "fair", "yes"],
                    [2, 1, "fair", "yes"],
                    [2, 1, "excellent", "no"],
                    [2, 1, "excellent", "yes"],
                    [1, 2, "fair", "no"],
                    [1, 1, "fair", "yes"],
                    [2, 2, "fair", "yes"],
                    [1, 2, "excellent", "yes"],
                    [2, 2, "excellent", "yes"],
                    [2, 3, "fair", "yes"],
                    [2, 2, "excellent", "no"],
                    [2, 3, "fair", "yes"]]
    mypy = MyPyTable(iphone_col_names, iphone_table)
    y2 = myutils.get_mypycol(mypy, "buys_iphone")
    X2 = [row[:-1] for row in iphone_table]  # drop the class label from the features
    nb2 = MyNaiveBayesClassifier()
    nb2.fit(X2, y2)
    pred2 = nb2.predict([[1, 2, "fair"]])
    assert pred2 == ["yes"]

    # Bramer 3.2 train dataset
    train_col_names = ["day", "season", "wind", "rain", "class"]
    train_table = [["weekday", "spring", "none", "none", "on time"],
                   ["weekday", "winter", "none", "slight", "on time"],
                   ["weekday", "winter", "none", "slight", "on time"],
                   ["weekday", "winter", "high", "heavy", "late"],
                   ["saturday", "summer", "normal", "none", "on time"],
                   ["weekday", "autumn", "normal", "none", "very late"],
                   ["holiday", "summer", "high", "slight", "on time"],
                   ["sunday", "summer", "normal", "none", "on time"],
                   ["weekday", "winter", "high", "heavy", "very late"],
                   ["weekday", "summer", "none", "slight", "on time"],
                   ["saturday", "spring", "high", "heavy", "cancelled"],
                   ["weekday", "summer", "high", "slight", "on time"],
                   ["saturday", "winter", "normal", "none", "late"],
                   ["weekday", "summer", "high", "none", "on time"],
                   ["weekday", "winter", "normal", "heavy", "very late"],
                   ["saturday", "autumn", "high", "slight", "on time"],
                   ["weekday", "autumn", "none", "heavy", "on time"],
                   ["holiday", "spring", "normal", "slight", "on time"],
                   ["weekday", "spring", "normal", "none", "on time"],
                   ["weekday", "spring", "normal", "slight", "on time"]]
    mypy2 = MyPyTable(train_col_names, train_table)
    y3 = myutils.get_mypycol(mypy2, "class")
    X3 = [row[:-1] for row in train_table]  # drop the class label from the features
    nb3 = MyNaiveBayesClassifier()
    nb3.fit(X3, y3)
    pred3 = nb3.predict([["weekday", "winter", "high", "heavy"]])
    # Bramer 3.2 works this instance out to "very late"
    assert pred3 == ["very late"]
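# A hand check of the first prediction above, using only the naive Bayes
# arithmetic (the conditionals match the posteriors asserted in the fit test):
#   P(yes) * P(att0=1 | yes) * P(att1=5 | yes) = 5/8 * 4/5 * 2/5 = 0.200
#   P(no)  * P(att0=1 | no)  * P(att1=5 | no)  = 3/8 * 2/3 * 2/3 ~ 0.167
# so "yes" wins, matching the assert.
assert 5/8 * 4/5 * 2/5 > 3/8 * 2/3 * 2/3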
def test_random_forest_fit():
    interview_header = ["level", "lang", "tweets", "phd", "interviewed_well"]
    interview_table = [["Senior", "Java", "no", "no", "False"],
                       ["Senior", "Java", "no", "yes", "False"],
                       ["Mid", "Python", "no", "no", "True"],
                       ["Junior", "Python", "no", "no", "True"],
                       ["Junior", "R", "yes", "no", "True"],
                       ["Junior", "R", "yes", "yes", "False"],
                       ["Mid", "R", "yes", "yes", "True"],
                       ["Senior", "Python", "no", "no", "False"],
                       ["Senior", "R", "yes", "no", "True"],
                       ["Junior", "Python", "yes", "no", "True"],
                       ["Senior", "Python", "yes", "yes", "True"],
                       ["Mid", "Python", "no", "yes", "True"],
                       ["Mid", "Java", "yes", "no", "True"],
                       ["Junior", "Python", "no", "yes", "False"]]
    myutils.prepend_attribute_label(interview_table, interview_header)
    interview_pytable = MyPyTable(column_names=interview_header, data=interview_table)
    y_col = interview_pytable.get_column("interviewed_well", False)
    x_cols = interview_pytable.drop_col("interviewed_well")

    many_trees = MyRandomForestClassifier()
    X_sample, y_sample = myutils.compute_bootstrapped_sample(x_cols, y_col)
    X_train, X_test, y_train, y_test = myutils.train_test_split(X_sample, y_sample, .33)
    many_trees.fit(X_train, y_train, X_test, y_test)
    y_predicted = many_trees.predict(X_test)

    num_correct = 0
    num_wrong = 0
    for predicted, actual in zip(y_predicted, y_test):
        if predicted == actual:
            num_correct += 1
        else:
            num_wrong += 1
    accuracy = np.round(num_correct / (num_correct + num_wrong), 3)
    error_rate = np.round(num_wrong / (num_correct + num_wrong), 3)

    print("-----------------------------------------------------------")
    print("Accuracy and Error Rate")
    print("-----------------------------------------------------------")
    print()
    print("Random Forest: accuracy = {}, error rate = {}".format(accuracy, error_rate))
    print()
    print("Because of the random aspect of this classifier, this will not always pass the tests")
    print()
    print("Predicted table: " + str(y_predicted))
    print("Testing set: " + str(y_test))
    for predicted, actual in zip(y_predicted, y_test):
        assert predicted == actual
def test_My_Random_Forest_Classifier_predict():
    # Object declarations; tests with N = 3, M = 2, F = 2, and seed = 1
    rand_forest_test = MyRandomForestClassifier(3, 2, 2, 1)
    table = MyPyTable()

    # Variable assignment and declaration
    table.data = interview_table
    table.column_names = interview_header
    y_train, X_train = [], []
    for inst in interview_table:
        y_train.append(inst[-1])
        X_train.append(inst[:-1])

    # Sets X_test
    X_test = [["Junior", "Java", "yes", "no"],
              ["Junior", "Java", "yes", "yes"]]

    # Tests on the interview dataset
    rand_forest_test.header = interview_header[:-1]
    rand_forest_test.fit(X_train, y_train)
    y_predicted = rand_forest_test.predict(X_test)
    print("y_predicted:", y_predicted)

    # Trace test
    assert y_predicted == ['True', 'False']
def confusionCategorical(yTrue, yPred, header, categories):
    """Builds a confusion matrix as a MyPyTable.

    header is expected to be [label, *categories, "Total", "Recognition (%)"];
    rows are actual classes, columns are predicted classes.
    """
    table = MyPyTable()
    table.column_names = header
    table.data = []
    for val in categories:
        table.data.append([val] + [0] * (len(header) - 1))

    # tally each (actual, predicted) pair
    for actual, predicted in zip(yTrue, yPred):
        row_index = categories.index(actual)
        col_index = header.index(predicted)
        table.data[row_index][col_index] += 1

    # fill in the Total column
    for row in table.data:
        row[len(categories) + 1] = sum(row[1:len(categories) + 1])

    # fill in the Recognition (%) column: diagonal count over the row total
    for i, row in enumerate(table.data):
        total = row[len(categories) + 1]
        if total != 0:
            recognition = row[i + 1] / total
            row[len(header) - 1] = round(100 * recognition, 2)
    return table
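# A minimal usage sketch for confusionCategorical; the header layout below is
# an assumption (label column, one column per category, then Total and
# Recognition (%)), and the labels are made up for illustration.
y_true = ["yes", "yes", "no", "yes"]
y_pred = ["yes", "no", "no", "yes"]
categories = ["yes", "no"]
header = ["class", "yes", "no", "Total", "Recognition (%)"]
matrix = confusionCategorical(y_true, y_pred, header, categories)
# Expected rows: ["yes", 2, 1, 3, 66.67] and ["no", 0, 1, 1, 100.0]
for row in matrix.data:
    print(row)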
def table_setUp(file_name):
    """Loads a MyPyTable from a file in the input_data directory.

    Args:
        file_name(str): Name of the file to load.

    Returns:
        table(MyPyTable): The loaded table.
    """
    file_path = os.path.join("input_data", file_name)
    # Inputs data from file into the table
    table = MyPyTable().load_from_file(file_path)
    return table
def bagging(X, Y, N, M, F):
    # 1. split the dataset into a test set and a "remainder set"
    x_remainder, x_test, y_r, y_test = myevaluation.train_test_split(X, Y)

    # 2. using the remainder set, sample N bootstrap samples and use each one
    #    to build a classifier; for each sample, ~63% of the remainder set is
    #    drawn into the training set and the ~37% left over becomes this
    #    tree's validation set
    forest = []
    accuracies = {}  # {tree index (as str): validation accuracy}
    for i in range(N):
        # get the bootstrap sample
        x_train, y_train = compute_bootstrapped_sample(x_remainder, y_r)
        tree = my_class.MyDecisionTreeClassifier()
        tree.fit(x_train, y_train, True, F)  # build the classifier
        # the leftover (out-of-bag) remainder instances form the validation set
        x_v = []
        y_v = []
        for j in range(len(x_remainder)):
            if x_remainder[j] not in x_train:
                x_v.append(x_remainder[j])
                y_v.append(y_r[j])
        pred = tree.predict(x_v)
        accuracies[str(i)] = get_accuracy(y_v, pred)
        forest.append(tree)

    # 3. measure the performance of each tree on its validation set and keep
    #    the best M of the N trees based on that metric
    best_trees_dict = best_M(M, accuracies)
    best_trees = [forest[int(key)] for key in best_trees_dict]

    # 4. using majority voting, make predictions from the M learners for each
    #    instance in the test set
    all_predictions = []  # [[predictions from tree 1], [predictions from tree 2], ...]
    for tree in best_trees:
        all_predictions.append(tree.predict(x_test))

    # think about this like flipping a table: get the majority for every row
    pred_header = build_header(all_predictions)  # turn all predictions into a MyPyTable
    pred_mypy = MyPyTable(pred_header, all_predictions)
    voted_predictions = []
    # loop through every test instance, build its column of predictions,
    # and pick the prediction by majority rule
    for i in range(len(all_predictions[0])):
        pred_col = pred_mypy.get_column(i)
        vals, counts = get_freq_str(pred_col)
        voted_predictions.append(vals[counts.index(max(counts))])

    forest_accuracy = get_accuracy(y_test, voted_predictions)
    return best_trees, voted_predictions, forest_accuracy
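# Hedged sketches of two helpers bagging() relies on but that are defined
# elsewhere in this codebase; these minimal versions match how they are
# called above, but the real signatures and behavior may differ.
import random


def compute_bootstrapped_sample(X, y):
    # draw len(X) parallel (X, y) pairs with replacement; in expectation,
    # ~63% of the distinct rows appear in the sample
    indexes = [random.randrange(len(X)) for _ in range(len(X))]
    return [X[i] for i in indexes], [y[i] for i in indexes]


def best_M(M, accuracies):
    # keep the M dict keys with the highest validation accuracy
    ranked = sorted(accuracies, key=accuracies.get, reverse=True)
    return {key: accuracies[key] for key in ranked[:M]}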
def test_random_forest_classifier_fit():
    mp_table = MyPyTable(interview_header, interview_table)
    # Formulate X_train and y_train
    y_train = mp_table.get_column('interviewed_well')
    X_train_col_names = ["level", "lang", "tweets", "phd"]
    X_train = mp_table.get_rows(X_train_col_names)
    myRF = MyRandomForestClassifier(N=4, M=2, F=4)
    myRF.fit(X_train, y_train)
    assert len(myRF.M_attr_sets) == myRF.M
def combine_two_columns(column_names, col1, col2):
    """Creates a MyPyTable from two columns and their column names.

    Args:
        column_names(list): List of string column names
        col1(list): List of values from the first column
        col2(list): List of values from the second column

    Returns:
        table(MyPyTable): MyPyTable with the two columns as parallel data
    """
    data = []
    for i in range(len(col1)):
        data.append([col1[i], col2[i]])
    table = MyPyTable(column_names, data)
    return table
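# Minimal usage sketch for combine_two_columns; the column names and values
# here are made up for illustration.
ratings = combine_two_columns(["title", "score"],
                              ["Movie A", "Movie B"],
                              [8.1, 6.4])
# ratings.data == [["Movie A", 8.1], ["Movie B", 6.4]]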
def test_My_Random_Forest_Classifier_fit():
    # Object declarations; tests with N = 3, M = 2, F = 2 and seed = 0
    rand_forest_test = MyRandomForestClassifier(3, 2, 2, 0)
    table = MyPyTable()

    # Variable assignment and declaration
    table.data = interview_table
    table.column_names = interview_header
    X_train = interview_table
    y_train = table.get_column("interviewed_well")

    # Tests on the interview dataset
    rand_forest_test.header = interview_header
    rand_forest_test.fit(X_train, y_train)
    trees = rand_forest_test.trees
    assert trees is not None
def random_forest_predict(X_test, trees):
    # using majority voting, make predictions from the M learners for each
    # instance in the test set
    all_predictions = []  # [[predictions from tree 1], [predictions from tree 2], ...]
    for tree in trees:
        all_predictions.append(tree.predict(X_test))

    # think about this like flipping a table: get the majority for every row
    pred_header = build_header(all_predictions)  # turn all predictions into a MyPyTable
    pred_mypy = MyPyTable(pred_header, all_predictions)
    voted_predictions = []
    # loop through every test instance, build its column of predictions,
    # and pick the prediction by majority rule
    for i in range(len(all_predictions[0])):
        pred_col = pred_mypy.get_column(i)
        vals, counts = get_freq_str(pred_col)
        voted_predictions.append(vals[counts.index(max(counts))])
    return voted_predictions
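# A standard-library equivalent of the voting step above, assuming each
# tree's predictions are plain lists; collections.Counter avoids the
# MyPyTable/get_freq_str round trip.
from collections import Counter


def majority_vote(all_predictions):
    voted = []
    for instance_preds in zip(*all_predictions):  # "flip the table"
        voted.append(Counter(instance_preds).most_common(1)[0][0])
    return voted

# e.g. majority_vote([["yes", "no"], ["yes", "no"], ["no", "no"]]) == ["yes", "no"]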
def test_random_forest_classifier_predict():
    # test instances hold features only; the class labels belong in y_test
    X_test = [["Mid", "Python", "no", "no"],
              ["Mid", "R", "yes", "yes"],
              ["Mid", "Python", "no", "yes"]]
    y_test = ["True", "True", "True"]
    mp_table = MyPyTable(interview_header, interview_table)
    # Formulate X_train and y_train
    y_train = mp_table.get_column('interviewed_well')
    X_train_col_names = ["level", "lang", "tweets", "phd"]
    X_train = mp_table.get_rows(X_train_col_names)
    myRF = MyRandomForestClassifier(N=4, M=2, F=4)
    myRF.fit(X_train, y_train)
    predictions = myRF.predict(X_test)
    for i in range(len(predictions)):
        assert predictions[i] == y_test[i]
def get_freq_str(col):
    header = ["y"]
    col_mypy = MyPyTable(header, col)
    dups = col_mypy.ordered_col(header)
    values = []
    counts = []
    for value in dups:
        # compare the stringified value, since that is what gets stored
        if str(value) not in values:
            # first time we have seen this value
            values.append(str(value))
            counts.append(1)
        else:
            # we have seen this value before
            counts[-1] += 1  # ok because the list is sorted
    return values, counts
def compute_entropy(instances, available_attributes, index):
    """Computes E_new: the weighted sum of the entropies of the partitions
    produced by splitting on the attribute at the given index."""
    mypy = MyPyTable(available_attributes, instances)
    classes = mypy.get_column(-1)
    attributes = mypy.get_column(index)
    att_values = set(attributes)
    _, tables = group_by(attributes, classes)
    totals = []
    sub_entropies = []
    for j, value in enumerate(att_values):
        # count how many instances have this attribute value
        totals.append(attributes.count(value))
        # parallel array: counts of this attribute value within each class
        class_counts = [table.count(value) for table in tables]
        entropy = 0
        for count in class_counts:
            if count > 0:
                entropy -= count / totals[j] * math.log2(count / totals[j])
        # weight the partition's entropy by its relative size
        entropy *= totals[j] / len(attributes)
        sub_entropies.append(entropy)
    return sum(sub_entropies)
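# A small worked check of the weighted entropy (E_new) computed above, using
# a made-up split: attribute values a/a/b/b with classes yes/no/no/no.
import math

# partition "a" holds classes {yes, no}: entropy = 1.0, weight 2/4
e_a = -(1 / 2) * math.log2(1 / 2) - (1 / 2) * math.log2(1 / 2)
# partition "b" holds classes {no, no}: entropy = 0.0, weight 2/4
e_b = 0.0
e_new = (2 / 4) * e_a + (2 / 4) * e_b
assert e_new == 0.5  # compute_entropy on this split should agree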
# Assumed context (not part of the original snippet): `sp` is an
# authenticated spotipy client and `playlist_items` holds the playlist's
# track entries being looped over.
track_data_objs = []
for track in playlist_items:
    track_data = []
    popularity = track["track"]["popularity"]
    if popularity == 0:
        continue  # skip any track with 0 popularity; unsure if 0 is a default value
    # name = track["track"]["name"]
    # track_data.append(name)  # would make each row identifiable downstream
    features_dict = sp.audio_features(track["track"]["id"])  # list with one features dict
    skip_keys = {"type", "id", "uri", "track_href", "analysis_url",
                 "time_signature", "mode", "key", "loudness"}
    for key in features_dict[0]:  # loop through and add only the attributes we want
        if key not in skip_keys:
            val = features_dict[0][key]
            if key != "tempo" and key != "duration_ms":
                val = myutils.percent_to_rating(val)
            track_data.append(val)
    pop_class = myutils.pop_rating(popularity)
    track_data.append(pop_class)  # the popularity class will be the y_train
    track_data_objs.append(track_data)

# turn the collected rows into a MyPyTable and save it; if a name identifier
# column is kept, it can be deleted when the data is used later
print(len(track_data_objs))
header = ['danceability', 'energy', 'speechiness', 'acousticness',
          'instrumentalness', 'liveness', 'valence', 'tempo',
          'duration_ms', 'popularity']
tracks_mypy = MyPyTable(header, track_data_objs)
tracks_mypy.save_to_file("tracks_data.txt")
from mysklearn.myclassifiers import MyNaiveBayesClassifier
import os
from mysklearn.mypytable import MyPyTable
import mysklearn.myevaluation as myevaluation
import mysklearn.myutils as myutils
import pickle

fname = os.path.join("input_data", "collisions.csv")
collisions_data = MyPyTable().load_from_file(fname)

weather = collisions_data.get_column('WEATHER')
road_condition = collisions_data.get_column('ROADCOND')
light_condition = collisions_data.get_column('LIGHTCOND')
junction_type = collisions_data.get_column('JUNCTIONTYPE')
severity = collisions_data.get_column('SEVERITYDESC')

X_train = [[weather[i], road_condition[i], light_condition[i],
            junction_type[i], severity[i]] for i in range(len(weather))]
y_train = collisions_data.get_column('COLLISIONTYPE')

# drop instances with an unknown collision type; filtering into new lists
# avoids the index-skipping bug of deleting while iterating
X_train = [x for x, label in zip(X_train, y_train) if label != 'Unknown']
y_train = [label for label in y_train if label != 'Unknown']

strattrain_folds, strattest_folds = myevaluation.stratified_kfold_cross_validation(
    X_train, y_train, 10)
strat_xtrain, strat_ytrain, strat_xtest, strat_ytest = myutils.get_from_folds(
    X_train, y_train, strattrain_folds, strattest_folds)
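# Hedged sketch of the myutils.get_from_folds helper used above, assuming
# each fold is a list of row indexes (the real helper may differ).
def get_from_folds(X, y, train_folds, test_folds):
    X_train, y_train, X_test, y_test = [], [], [], []
    for fold in train_folds:
        for i in fold:
            X_train.append(X[i])
            y_train.append(y[i])
    for fold in test_folds:
        for i in fold:
            X_test.append(X[i])
            y_test.append(y[i])
    return X_train, y_train, X_test, y_test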
from mysklearn.myclassifiers import MyKNeighborsClassifier
import os
from mysklearn.mypytable import MyPyTable
import mysklearn.myevaluation as myeval
import mysklearn.myutils as myutils
import pickle

# Import the data table and columns
movies_fname = os.path.join("input_data", "movies.csv")
movies_table = MyPyTable().load_from_file(movies_fname, encode='cp1252')

# Compute profit; fetch each column once rather than inside the loop
gross = movies_table.get_column('gross')
budget = movies_table.get_column('budget')
gross_profit = [gross[i] - budget[i] for i in range(len(movies_table.data))]
profitted = [0 if profit < 0 else 1 for profit in gross_profit]
movies_table.add_column(profitted, 'profitted')

# Fit the kNN classifier to the movies data
kn_class = MyKNeighborsClassifier()
feature_cols = ['budget', 'votes', 'genre', 'rating', 'score',
                'star', 'director', 'writer']
features = movies_table.get_key_columns(feature_cols)
outcomes = profitted
kn_class.fit(features, outcomes)

packaged_object = kn_class
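# pickle is imported above but never used; a minimal sketch of serializing
# the fitted classifier (the "knn.p" filename is an assumption).
outfile = open("knn.p", "wb")
pickle.dump(packaged_object, outfile)
outfile.close()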
import pickle  # standard python library
from mysklearn.mypytable import MyPyTable
from mysklearn.myclassifiers import MyDecisionTreeClassifier, MyNaiveBayesClassifier
import mysklearn.myevaluation as myevaluation
import mysklearn.myutils as myutils
import os

# "pickle" an object (AKA object serialization):
#   save a Python object to a binary file
# "unpickle" an object (AKA object de-serialization):
#   load a Python object from a binary file (back into memory)

# Get data from the csv file
table = MyPyTable().load_from_file(
    os.path.join("input_files", "winequality-red.csv"))
y_col = table.get_column("quality", False)
x_cols = table.drop_col("quality")

# Use naive Bayes to classify
testcase = MyNaiveBayesClassifier()

# stratified k-fold returns fold INDEXES, not instances
train_folds, test_folds = myevaluation.stratified_kfold_cross_validation(
    x_cols, y_col, n_splits=10)
X_train, X_test, y_train, y_test = myutils.getInstances(
    train_folds, test_folds, x_cols, y_col)

for i, fold in enumerate(X_train):
    train, test = myutils.normalize_values(X_train[i], X_test[i])
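# Hedged sketch of the myutils.normalize_values helper used above: a min-max
# rescale of each attribute to [0, 1], with the test fold scaled by the
# training fold's ranges (the real helper may differ).
def normalize_values(train, test):
    n_atts = len(train[0])
    mins = [min(row[j] for row in train) for j in range(n_atts)]
    maxs = [max(row[j] for row in train) for j in range(n_atts)]

    def scale(rows):
        return [[(row[j] - mins[j]) / (maxs[j] - mins[j])
                 if maxs[j] != mins[j] else 0.0
                 for j in range(n_atts)] for row in rows]

    return scale(train), scale(test)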
def load_data(filename):
    data_path = os.path.join("input_data", filename)
    table = MyPyTable().load_from_file(data_path)
    return table
def test_naive_bayes_classifier_fit():
    train = [[1, 5], [2, 6], [1, 5], [1, 5], [1, 6], [2, 6], [1, 5], [1, 6]]
    y = ["yes", "yes", "no", "no", "yes", "no", "yes", "yes"]
    nb = MyNaiveBayesClassifier()
    nb.fit(train, y)
    assert nb.priors == [["yes", 5 / 8], ["no", 3 / 8]]
    assert nb.posteriors == [[0, ['yes', ['1', 0.8], ['2', 0.2]],
                                 ['no', ['1', 2 / 3], ['2', 1 / 3]]],
                             [1, ['yes', ['5', 0.4], ['6', 0.6]],
                                 ['no', ['5', 2 / 3], ['6', 1 / 3]]]]

    # RQ5 (fake) iPhone purchases dataset
    iphone_col_names = ["standing", "job_status", "credit_rating", "buys_iphone"]
    iphone_table = [[1, 3, "fair", "no"],
                    [1, 3, "excellent", "no"],
                    [2, 3, "fair", "yes"],
                    [2, 2, "fair", "yes"],
                    [2, 1, "fair", "yes"],
                    [2, 1, "excellent", "no"],
                    [2, 1, "excellent", "yes"],
                    [1, 2, "fair", "no"],
                    [1, 1, "fair", "yes"],
                    [2, 2, "fair", "yes"],
                    [1, 2, "excellent", "yes"],
                    [2, 2, "excellent", "yes"],
                    [2, 3, "fair", "yes"],
                    [2, 2, "excellent", "no"],
                    [2, 3, "fair", "yes"]]
    mypy = MyPyTable(iphone_col_names, iphone_table)
    y2 = myutils.get_mypycol(mypy, "buys_iphone")
    X2 = [row[:-1] for row in iphone_table]  # drop the class label from the features
    nb2 = MyNaiveBayesClassifier()
    nb2.fit(X2, y2)
    assert nb2.priors == [["no", 1 / 3], ["yes", 2 / 3]]
    # expected conditional posteriors P(value | class); the value ordering
    # inside each class list is implementation-dependent, so this is kept as
    # a reference rather than asserted
    nb2_posts = [[0, ['no', ['1', 3 / 5], ['2', 2 / 5]],
                     ['yes', ['1', 2 / 10], ['2', 8 / 10]]],
                 [1, ['no', ['3', 2 / 5], ['2', 2 / 5], ['1', 1 / 5]],
                     ['yes', ['3', 3 / 10], ['2', 4 / 10], ['1', 3 / 10]]],
                 [2, ['no', ['fair', 2 / 5], ['excellent', 3 / 5]],
                     ['yes', ['fair', 7 / 10], ['excellent', 3 / 10]]]]
    # assert nb2.posteriors == nb2_posts

    # Bramer 3.2 train dataset
    train_col_names = ["day", "season", "wind", "rain", "class"]
    train_table = [["weekday", "spring", "none", "none", "on time"],
                   ["weekday", "winter", "none", "slight", "on time"],
                   ["weekday", "winter", "none", "slight", "on time"],
                   ["weekday", "winter", "high", "heavy", "late"],
                   ["saturday", "summer", "normal", "none", "on time"],
                   ["weekday", "autumn", "normal", "none", "very late"],
                   ["holiday", "summer", "high", "slight", "on time"],
                   ["sunday", "summer", "normal", "none", "on time"],
                   ["weekday", "winter", "high", "heavy", "very late"],
                   ["weekday", "summer", "none", "slight", "on time"],
                   ["saturday", "spring", "high", "heavy", "cancelled"],
                   ["weekday", "summer", "high", "slight", "on time"],
                   ["saturday", "winter", "normal", "none", "late"],
                   ["weekday", "summer", "high", "none", "on time"],
                   ["weekday", "winter", "normal", "heavy", "very late"],
                   ["saturday", "autumn", "high", "slight", "on time"],
                   ["weekday", "autumn", "none", "heavy", "on time"],
                   ["holiday", "spring", "normal", "slight", "on time"],
                   ["weekday", "spring", "normal", "none", "on time"],
                   ["weekday", "spring", "normal", "slight", "on time"]]
    mypy2 = MyPyTable(train_col_names, train_table)
    y3 = myutils.get_mypycol(mypy2, "class")
    X3 = [row[:-1] for row in train_table]  # drop the class label from the features
    nb3 = MyNaiveBayesClassifier()
    nb3.fit(X3, y3)
def load_data(filename):
    mypytable = MyPyTable()
    mypytable.load_from_file(filename)
    return mypytable
# heavily based on the app from class
import pickle
from mysklearn.myclassifiers import MyRandomForestClassifier
from mysklearn.mypytable import MyPyTable
import os

fname = os.path.join("input_data", "tracks_data_backup.txt")
tracks = MyPyTable().load_from_file(fname)

Danceability = tracks.get_column('danceability')
Energy = tracks.get_column('energy')
Acousticness = tracks.get_column('acousticness')
Valence = tracks.get_column('valence')

y_train = Acousticness
x_train = [[Danceability[i], Energy[i], Valence[i]] for i in range(len(y_train))]

rf = MyRandomForestClassifier()
rf.fit(x_train, y_train, 30, 4, 2)  # N=30 trees, keep the best M=4, F=2 attributes per split

# serialize to file (pickle)
outfile = open("trees.p", "wb")
pickle.dump(rf.trees, outfile)
outfile.close()

# deserialize to object (unpickle)
infile = open("trees.p", "rb")
trees2 = pickle.load(infile)
infile.close()
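# Minimal usage sketch: make predictions with the unpickled trees via the
# random_forest_predict helper defined earlier, assuming each unpickled tree
# exposes predict(); the sample's feature values are made up for illustration.
sample = [[3, 4, 2]]  # [danceability, energy, valence] ratings
predicted = random_forest_predict(sample, trees2)
print("predicted acousticness rating:", predicted)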
from mysklearn.mypytable import MyPyTable

# Object declaration
table = MyPyTable()

# Trims the dataset: keeps only the rows for one city
city = "Sydney"
table.load_from_file("weatherAUS.csv")
table.column_names[0] = 'Location'  # rename the first column so group_by can find 'Location'
names, tables = table.group_by("Location")
city_index = names.index(city)

print("\n")
for i in range(10):
    print(tables[city_index][i])

table.data = tables[city_index]
table.save_to_file(city + "_weather.csv")