Beispiel #1
0
def get_ratings_genre(table, genre, rating):
    """Collect the ratings of every row whose genre entry contains `genre`.

    Args:
        table(MyPyTable): table to read the 'Genres' and rating columns from
        genre(string): genre to search for inside each 'Genres' entry
        rating(string): name of the rating column (service provider) to read

    Returns:
        list: the matching ratings with empty-string entries left out. For
            the 'Rotten Tomatoes' column, string values containing '%' are
            converted to float with the percent sign stripped.
    """
    genre_col = MyPyTable.get_column(table, 'Genres', True)
    rating_col = MyPyTable.get_column(table, rating, True)

    is_percent_column = rating == 'Rotten Tomatoes'
    ratings = []
    for genres, value in zip(genre_col, rating_col):
        if genre not in genres:
            continue
        if value == '':
            # blank rating: nothing to report for this row
            continue
        # Only strings can carry a percent sign; numeric ratings pass through.
        if is_percent_column and isinstance(value, str) and '%' in value:
            value = float(value.strip('%'))
        ratings.append(value)

    return ratings
def scatter_plot(table, x_column_name, y_column_name):
    """Draw a scatter plot of two columns with a fitted line over it.

    The line's legend entry shows the correlation coefficient and covariance.

    Args:
        table(MyPyTable): given table to perform operation
        x_column_name(string): name of the column plotted on the x axis
        y_column_name(string): name of the column plotted on the y axis

    Returns:
        coeficient(float): coeficient value
        cov(float): covariance value
    """
    y_col = MyPyTable.get_column(table, y_column_name, False)
    x_col = MyPyTable.get_column(table, x_column_name, False)

    coeficient = utils.correlation_coeficient(x_col, y_col)
    cov = utils.covariance(x_col, y_col)
    m, b = utils.compute_slope_intercept(x_col, y_col)

    plt.scatter(x_col, y_col)

    # The line only needs its two end points: the extremes of the x data.
    x_low = min(x_col)
    x_high = max(x_col)
    legend_text = f"corr: {coeficient!s}, cov: {cov!s}"
    plt.plot([x_low, x_high],
             [m * x_low + b, m * x_high + b],
             c="r",
             label=legend_text)
    plt.legend()
    plt.plot()
    plt.show()

    return coeficient, cov
Beispiel #3
0
def get_sea_frequencies(MyPyTable, col_name):
    """Count, per month, the rows that remain after dropping the "FALSE" rows.

    Rows whose `col_name` entry equals "FALSE" are removed from the table
    through `drop_rows`; the 'DATE' column of what is left is then scanned
    for each month abbreviation.

    Args:
        MyPyTable(MyPyTable): table instance to filter and read
        col_name(str): name of the column holding the "FALSE" markers

    Returns:
        months, yes_col (list of str, list of int): the twelve month
            abbreviations and, in parallel, how many remaining 'DATE'
            entries contain each one
    """
    flag_col = MyPyTable.get_column(col_name)
    print("range:", len(flag_col), len(MyPyTable.data))

    false_positions = {
        pos for pos, flag in enumerate(flag_col) if flag == "FALSE"
    }
    # drop_rows is handed the row contents, not their positions
    doomed_rows = [
        row for pos, row in enumerate(MyPyTable.data) if pos in false_positions
    ]
    MyPyTable.drop_rows(doomed_rows)

    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    date_col = MyPyTable.get_column('DATE')
    yes_col = [
        sum(1 for date in date_col if month in date) for month in months
    ]

    return months, yes_col
def test_random_forest_fit():
    """Fit a random forest on a bootstrapped interview sample and check it.

    Prints the accuracy and error rate on the held-out split, then asserts
    that every prediction matches its actual label. The printed output
    itself warns that the classifier is random, so this will not always pass.
    """
    interview_header = ["level", "lang", "tweets", "phd", "interviewed_well"]
    interview_table = [["Senior", "Java", "no", "no", "False"],
                       ["Senior", "Java", "no", "yes", "False"],
                       ["Mid", "Python", "no", "no", "True"],
                       ["Junior", "Python", "no", "no", "True"],
                       ["Junior", "R", "yes", "no", "True"],
                       ["Junior", "R", "yes", "yes", "False"],
                       ["Mid", "R", "yes", "yes", "True"],
                       ["Senior", "Python", "no", "no", "False"],
                       ["Senior", "R", "yes", "no", "True"],
                       ["Junior", "Python", "yes", "no", "True"],
                       ["Senior", "Python", "yes", "yes", "True"],
                       ["Mid", "Python", "no", "yes", "True"],
                       ["Mid", "Java", "yes", "no", "True"],
                       ["Junior", "Python", "no", "yes", "False"]]
    myutils.prepend_attribute_label(interview_table, interview_header)

    interview_pytable = MyPyTable(column_names=interview_header,
                                  data=interview_table)
    y_col = interview_pytable.get_column("interviewed_well", False)
    x_cols = interview_pytable.drop_col("interviewed_well")

    many_trees = MyRandomForestClassifier()
    X_sample, y_sample = myutils.compute_bootstrapped_sample(x_cols, y_col)
    X_train, X_test, y_train, y_test = myutils.train_test_split(
        X_sample, y_sample, .33)
    many_trees.fit(X_train, y_train, X_test, y_test)
    y_predicted = many_trees.predict(X_test)

    # Tally hits and misses over the held-out labels (predicted vs. actual).
    total = len(y_test)
    hits = sum(1 for idx in range(total) if y_predicted[idx] == y_test[idx])
    misses = total - hits

    accuracy = np.round(hits / (hits + misses), 3)
    error_rate = np.round(misses / (hits + misses), 3)

    banner = "-----------------------------------------------------------"
    print(banner)
    print("Accuracy and Error Rate")
    print(banner)
    print()
    print("Random Forest: accuracy = {}, error rate = {}".format(
        accuracy, error_rate))
    print()
    print(
        "Because of the random aspect of this classifier, this will not always pass the tests"
    )
    print()
    print("Predicted table: " + str(y_predicted))
    print("Testing set:     " + str(y_test))

    for idx, actual in enumerate(y_test):
        assert y_predicted[idx] == actual
Beispiel #5
0
def get_aus_frequencies(MyPyTable, col_name):
    """Drop the rows whose "RainToday" entry is "No" and return the result.

    Args:
        MyPyTable(MyPyTable): table instance to filter
        col_name(str): name of the column (currently unused: the function
            always reads the "RainToday" column)

    Returns:
        whatever `drop_rows` returns; it is pretty-printed before being
        returned, so it is expected to be a table object
    """
    rain_col = MyPyTable.get_column("RainToday")

    # Positions of the rows where it did not rain.
    row_index_to_drop = {
        i for i, rained in enumerate(rain_col) if rained == "No"
    }
    # Keep only positions that actually exist in the table data.
    row_to_drop = [
        i for i in range(len(MyPyTable.data)) if i in row_index_to_drop
    ]

    # NOTE(review): drop_rows receives row indices here and its return value
    # is used as the filtered table -- confirm both against MyPyTable.drop_rows.
    table = MyPyTable.drop_rows(row_to_drop)
    table.pretty_print()
    return table
Beispiel #6
0
def unique_genres(table):
    """Get list of unique genres within a table

    Each distinct 'Genres' entry is a comma-separated string; the entries
    are split apart and every genre is reported once, in the order in which
    it is first seen.

    Args:
        table(MyPyTable): given object of MyPyTable

    Returns:
        values(list): list with unique genres"""
    vals, _counts = get_frequencies(table, 'Genres')

    # dict.fromkeys keeps first-seen order while discarding repeats; empty
    # pieces (from trailing or doubled commas) are skipped.
    values = list(dict.fromkeys(
        genre
        for entry in vals
        for genre in entry.split(',')
        if genre != ''
    ))
    return values
Beispiel #7
0
def test_random_forest_classifier_fit():
    """Fit a forest on the interview data and check how many attribute sets it keeps."""
    feature_names = ["level", "lang", "tweets", "phd"]
    interview = MyPyTable(interview_header, interview_table)

    # Labels come from the class column, features from the other four.
    labels = interview.get_column('interviewed_well')
    features = interview.get_rows(feature_names)

    forest = MyRandomForestClassifier(N=4, M=2, F=4)
    forest.fit(features, labels)

    assert len(forest.M_attr_sets) == forest.M
Beispiel #8
0
def bagging(X, Y, N, M, F):
    """Build N bootstrapped decision trees, keep the best M, and vote.

    Args:
        X(list of list): instances
        Y(list): labels parallel to X
        N(int): number of trees to build
        M(int): number of trees to keep, handed to best_M
        F(int): passed through to each tree's fit call

    Returns:
        best_trees(list): the trees selected by best_M
        voted_predictions(list): majority-vote prediction per test instance
        forest_accuracy: get_accuracy of the voted predictions on the test set
    """
    # Step 1: hold out a test set; everything else is the "remainder set".
    x_remainder, x_test, y_r, y_test = myevaluation.train_test_split(X, Y)

    # Step 2: one tree per bootstrap sample of the remainder set. Remainder
    # rows that the sample did not pick form that tree's validation set.
    forest = []
    accuracies = {}
    for tree_number in range(N):
        x_train, y_train = compute_bootstrapped_sample(x_remainder, y_r)
        tree = my_class.MyDecisionTreeClassifier()
        tree.fit(x_train, y_train, True, F)

        x_v = []
        y_v = []
        for position, row in enumerate(x_remainder):
            if row in x_train:
                continue
            x_v.append(row)
            y_v.append(y_r[position])

        accuracies[str(tree_number)] = get_accuracy(y_v, tree.predict(x_v))
        forest.append(tree)

    # Step 3: best_M picks M entries by validation accuracy; its keys are
    # the stringified positions of the trees in `forest`.
    best_trees = [forest[int(key)] for key in best_M(M, accuracies)]

    # Step 4: each kept tree predicts the whole test set, giving one row of
    # predictions per tree; a column then holds all votes for one instance.
    all_predictions = [tree.predict(x_test) for tree in best_trees]
    pred_header = build_header(all_predictions)
    pred_mypy = MyPyTable(pred_header, all_predictions)

    voted_predictions = []
    for instance_idx in range(len(all_predictions[0])):
        vals, counts = get_freq_str(pred_mypy.get_column(instance_idx))
        # the first value with the highest count wins
        voted_predictions.append(vals[counts.index(max(counts))])

    forest_accuracy = get_accuracy(y_test, voted_predictions)
    return best_trees, voted_predictions, forest_accuracy
def hist_graph(table, column_name):
    """Show a 10-bin histogram of one column of the table.

    Args:
        table(MyPyTable): table to read the column from
        column_name(string): name of the column to plot

    """
    plt.hist(MyPyTable.get_column(table, column_name, False), bins=10)
    plt.show()
def pie_chart_dataPrep(table, cols_to_plot):
    """Total each requested column, converting every entry to float.

    Args:
        table(MyPyTable): table to read the columns from
        cols_to_plot(list): column names (each is passed through str())

    Returns:
        totals_list(list): one total per name, in the order given
    """
    totals_list = []
    for name in cols_to_plot:
        running = 0
        for entry in MyPyTable.get_column(table, str(name)):
            running += float(entry)
        totals_list.append(running)
    return totals_list
Beispiel #11
0
def get_year_counts(table, platform):
    """Get the years in which a platform occurs, with a count for each year

    Rows whose year is 'N/A' are ignored.

    Args:
        table(MyPyTable): given object of MyPyTable
        platform(string): platform to search for in table

    Returns:
        values, counts (list, list of int): the distinct years in ascending
            order and, in parallel, how many rows of the platform fall in
            each year
    """
    plat_col = MyPyTable.get_column(table, 'Platform', True)
    year_col = MyPyTable.get_column(table, "Year", True)

    years = sorted(
        year for plat, year in zip(plat_col, year_col)
        if plat == platform and year != 'N/A'
    )

    values = []
    counts = []
    # The years are sorted, so equal years are adjacent: a single pass that
    # extends the current run is enough to count them.
    for year in years:
        if values and values[-1] == year:
            counts[-1] += 1
        else:
            values.append(year)
            counts.append(1)

    return values, counts
Beispiel #12
0
def convert_attributes(table):
    """Rescale IMDb ratings by ten and turn Rotten Tomatoes ratings into floats.

    Args:
        table(MyPyTable): given object of MyPyTable

    Returns:
        imbd_col(list): each IMDb value multiplied by 10
        rotten_col(list): each Rotten Tomatoes value with its last character
            removed and the rest converted to float"""
    imdb_raw = MyPyTable.get_column(table, 'IMDb', False)
    rotten_col = MyPyTable.get_column(table, 'Rotten Tomatoes', False)

    imbd_col = [score * 10 for score in imdb_raw]

    # Converted in place: the final character (the '%') is cut off first.
    for position, text in enumerate(rotten_col):
        rotten_col[position] = float(text[:-1])

    return imbd_col, rotten_col
Beispiel #13
0
def get_mpg_frequencies(MyPyTable, col_name):
    """Sort the values of a column into ten numbered ranges and count them

    Each value of the column is matched against a fixed set of ranges
    numbered 1 to 10 and handed, together with that number, to
    `mpg_val_check`, which returns the updated `values` and `counts` lists.
    Values that match no range (below 13, or strictly between 44 and 45)
    are skipped.

    Args:
        MyPyTable(MyPyTable): self of MyPyTable
        col_name(str): name of the column

    Returns:
        values, counts (list, list): `values` sorted ascending and `counts`
            rearranged to line up with it.
            NOTE(review): presumably `values` holds the range numbers 1-10
            recorded by mpg_val_check -- confirm against that helper."""

    col = MyPyTable.get_column(col_name)

    values = []
    counts = []

    for value in col:
        # NOTE(review): this compares the raw column value with what is
        # already stored in `values`; if `values` holds range numbers, this
        # is almost always true -- confirm the intent.
        if value not in values:
            # pick the range number for this value
            if value >= 13 and value < 14:
                values, counts = mpg_val_check(1, values, counts, value)
            elif value == 14:
                values, counts = mpg_val_check(2, values, counts, value)
            elif value > 14 and value <= 16:
                values, counts = mpg_val_check(3, values, counts, value)
            elif value > 16 and value <= 19:
                values, counts = mpg_val_check(4, values, counts, value)
            elif value > 19 and value <= 23:
                values, counts = mpg_val_check(5, values, counts, value)
            elif value > 23 and value <= 26:
                values, counts = mpg_val_check(6, values, counts, value)
            elif value > 26 and value <= 30:
                values, counts = mpg_val_check(7, values, counts, value)
            elif value > 30 and value <= 36:
                values, counts = mpg_val_check(8, values, counts, value)
            elif value > 36 and value <= 44:
                values, counts = mpg_val_check(9, values, counts, value)
            elif value >= 45:
                values, counts = mpg_val_check(10, values, counts, value)

    temp_counts = copy.deepcopy(counts)

    # Re-order the counts so they follow the sorted values: each entry of
    # `values` is used as a 1-based slot number into temp_counts.
    # NOTE(review): temp_counts is only as long as `counts`, so a slot
    # number larger than that raises IndexError -- this looks safe only
    # when the slot numbers form a gap-free run starting at 1; verify.
    for i in range(len(values)):
        index = values[i]
        temp_counts[index - 1] = counts[i]
    values.sort()
    counts = temp_counts

    return values, counts
Beispiel #14
0
def compute_entropy(instances, available_attributes, index):
    """Compute the weighted entropy of the class labels after splitting on one attribute.

    Args:
        instances(list of list): rows whose last column is the class label
        available_attributes(list): column names used to build the table
        index: identifier of the attribute column to split on

    Returns:
        the sum, over the distinct attribute values, of each partition's
        entropy weighted by the share of instances in that partition
    """
    mypy = MyPyTable(available_attributes, instances)
    classes = mypy.get_column(-1)
    attributes = mypy.get_column(index)
    distinct_values = set(attributes)
    # Only the grouped tables are needed: one per class, each is searched
    # below for occurrences of an attribute value.
    __, tables = group_by(attributes, classes)

    sub_entropies = []
    for attribute_value in distinct_values:
        partition_size = attributes.count(attribute_value)
        entropy = 0
        for class_table in tables:
            hits = class_table.count(attribute_value)
            # a class that never occurs in this partition contributes nothing
            if hits > 0:
                entropy -= hits / partition_size * math.log2(hits / partition_size)
        entropy *= partition_size / len(attributes)
        sub_entropies.append(entropy)
    return sum(sub_entropies)
def percent_hist_graph(table, column_name):
    """Show a 10-bin histogram of a column whose entries end in a percent sign.

    The last character of every entry is cut off and the rest is converted
    to float before plotting.

    Args:
        table(MyPyTable): table to read the column from
        column_name(string): name of the column to plot

    """
    percentages = MyPyTable.get_column(table, column_name, False)

    for position, text in enumerate(percentages):
        percentages[position] = float(text[:-1])

    plt.hist(percentages, bins=10)
    plt.show()
Beispiel #16
0
def percentages_columns(table, column_names):
    """Express each column's occurrence count as a percentage of the column length.

    The length of the first named column is used as the denominator for
    every column.

    Args:
        table(MyPyTable): given object of MyPyTable
        column_names(list): List of string column names

    Returns:
        percentages(list): rounded percentages, parallel to column_names"""
    counts = get_occurences_given_columns(table, column_names)
    length = len(MyPyTable.get_column(table, column_names[0], False))
    return [round((count / length) * 100, 0) for count in counts]
Beispiel #17
0
def test_My_Random_Forest_Classifier_fit():
    """Fit a random forest on the interview dataset and read back its trees.

    The classifier is built with the positional arguments 3, 2, 2, 0 (per
    the original author's note: N = 3, M = 2, F = 2 and seed = 0).
    """
    forest = MyRandomForestClassifier(3, 2, 2, 0)

    # Fill an empty table with the interview data to pull the label column.
    interview = MyPyTable()
    interview.data = interview_table
    interview.column_names = interview_header
    labels = interview.get_column("interviewed_well")

    forest.header = interview_header
    forest.fit(interview_table, labels)

    fitted_trees = forest.trees
Beispiel #18
0
def test_random_forest_classifier_predict():
    """Fit a forest on the interview data and check three "True" predictions."""
    expected = ["True", "True", "True"]
    unseen = [["Mid", "Python", "no", "no", "True"],
              ["Mid", "R", "yes", "yes", "True"],
              ["Mid", "Python", "no", "yes", "True"]]

    # Labels come from the class column, features from the other four.
    interview = MyPyTable(interview_header, interview_table)
    labels = interview.get_column('interviewed_well')
    features = interview.get_rows(["level", "lang", "tweets", "phd"])

    forest = MyRandomForestClassifier(N=4, M=2, F=4)
    forest.fit(features, labels)
    predictions = forest.predict(unseen)

    for position, predicted in enumerate(predictions):
        assert predicted == expected[position]
def random_forest_predict(X_test, trees):
    """Predict each test instance by majority vote over the given trees.

    Args:
        X_test(list of list): instances to classify
        trees(list): fitted classifiers, each exposing predict()

    Returns:
        voted_predictions(list): one prediction per test instance
    """
    # One row of predictions per tree; a column then holds every tree's
    # vote for a single instance.
    per_tree = [tree.predict(X_test) for tree in trees]
    vote_table = MyPyTable(build_header(per_tree), per_tree)

    voted_predictions = []
    for instance_idx in range(len(per_tree[0])):
        vals, counts = get_freq_str(vote_table.get_column(instance_idx))
        # the first value with the highest count wins
        voted_predictions.append(vals[counts.index(max(counts))])

    return voted_predictions
Beispiel #20
0
def get_occurences_given_columns(table, column_names):
    """Count, for each named column, how many entries equal 1.0.

    Args:
        table(MyPyTable): given object of MyPyTable
        column_names(list): List of string column names

    Returns:
        count(list): one count per name, in the order the names were given"""
    # Fetch every column first, then count.
    fetched = [MyPyTable.get_column(table, name, False) for name in column_names]
    return [sum(1 for cell in cells if cell == 1.0) for cells in fetched]
Beispiel #21
0
def get_frequencies(MyPyTable, col_name):
    """Tally how often each distinct value occurs in a column.

    Args:
        MyPyTable(MyPyTable): table instance to read the column from
        col_name(str): name of the column

    Returns:
        values, counts (list, list of int): the distinct values in
            first-seen order and, in parallel, their occurrence counts"""
    values = []
    counts = []

    for entry in MyPyTable.get_column(col_name):
        try:
            slot = values.index(entry)
        except ValueError:
            # first occurrence of this value
            values.append(entry)
            counts.append(1)
        else:
            counts[slot] += 1

    return values, counts
from mysklearn.mypytable import MyPyTable
from mysklearn.myclassifiers import MyDecisionTreeClassifier, MyNaiveBayesClassifier
import mysklearn.myevaluation as myevaluation
import mysklearn.myutils as myutils
import os

# "pickle" an object (AKA object serialization)
# save a Python object to a binary file

# "unpickle" an object (AKA object de-serialization)
# load a Python object from a binary file (back into memory)

# Load the red-wine quality data from input_files/winequality-red.csv.
table = MyPyTable().load_from_file(
    os.path.join("input_files", "winequality-red.csv"))
# "quality" is pulled out as y_col; x_cols is what drop_col returns for it
# (presumably the remaining feature columns -- confirm against MyPyTable).
y_col = table.get_column("quality", False)
x_cols = table.drop_col("quality")

# Use Naive Bayes to classify
testcase = MyNaiveBayesClassifier()

# Per the original author's note, the cross-validation call returns INDEXES
# into x_cols rather than the instances themselves; getInstances is then
# given those folds plus x_cols and y_col and returns the four per-fold lists.
X_train, X_test = myevaluation.stratified_kfold_cross_validation(x_cols,
                                                                 y_col,
                                                                 n_splits=10)
X_train, X_test, y_train, y_test = myutils.getInstances(
    X_train, X_test, x_cols, y_col)

# For every fold: normalize that fold's train/test pair, then fit the
# classifier on the normalized training part. `fold` and `test` are not
# used by the code visible here.
for i, fold in enumerate(X_train):
    train, test = myutils.normalize_values(X_train[i], X_test[i])
    testcase.fit(train, y_train[i])
Beispiel #23
0
from mysklearn.myclassifiers import MyKNeighborsClassifier
import os
from mysklearn.mypytable import MyPyTable
import mysklearn.myevaluation as myeval
import mysklearn.myutils as myutils
import pickle

# Load the movies table (the file is read with the cp1252 encoding).
movies_fname = os.path.join("input_data", "movies.csv")
# movie_data = MyPyTable().load_from_file_no_encode(movies_fname)
movies_table = MyPyTable().load_from_file(movies_fname, encode='cp1252')

# Profit per movie = gross - budget. Each column is fetched once up front
# rather than once per row inside the comprehension.
gross_col = movies_table.get_column('gross')
budget_col = movies_table.get_column('budget')
gross_profit = [
    gross_col[i] - budget_col[i] for i in range(len(movies_table.data))
]
# Class label: 1 when the movie at least broke even, 0 when it lost money.
profitted = [0 if gross < 0 else 1 for gross in gross_profit]
movies_table.add_column(profitted, 'profitted')

# fit the KNN algorithm to the movies data
kn_class = MyKNeighborsClassifier()
feature_cols = [
    'budget', 'votes', 'genre', 'rating', 'score', 'star', 'director', 'writer'
]
features = movies_table.get_key_columns(feature_cols)
outcomes = profitted
kn_class.fit(features, outcomes)

# The fitted classifier is exposed under this name.
packaged_object = kn_class
#heavily based on app from class
import pickle
from mysklearn.myclassifiers import MyRandomForestClassifier
from mysklearn.mypytable import MyPyTable
import os

# Load the tracks table and pull out the audio-feature columns.
fname = os.path.join("input_data", "tracks_data_backup.txt")
tracks = MyPyTable().load_from_file(fname)

Danceability = tracks.get_column('danceability')
Energy = tracks.get_column('energy')
Acousticness = tracks.get_column('acousticness')
Valence = tracks.get_column('valence')

# Acousticness is the target; the other three features form each instance.
y_train = Acousticness
x_train = [[Danceability[i], Energy[i], Valence[i]]
           for i in range(len(y_train))]

# NOTE(review): the first forest is fitted and then immediately replaced by
# the second one, so only the (30, 4, 2) fit is pickled below. The first fit
# is kept because removing it could change any shared random state --
# confirm whether it is still wanted.
rf = MyRandomForestClassifier()
rf.fit(x_train, y_train, 20, 7, 2)
rf = MyRandomForestClassifier()
rf.fit(x_train, y_train, 30, 4, 2)

# serialize to file (pickle); `with` closes the file even if dump raises
with open("trees.p", "wb") as outfile:
    pickle.dump(rf.trees, outfile)

# deserialize to object (unpickle)
# Only unpickle files this script wrote itself: pickle.load can run
# arbitrary code from an untrusted file.
with open("trees.p", "rb") as infile:
    trees2 = pickle.load(infile)
Beispiel #25
0
from mysklearn.myclassifiers import MyNaiveBayesClassifier
import os
from mysklearn.mypytable import MyPyTable
import mysklearn.myevaluation as myevaluation
import mysklearn.myutils as myutils
import pickle

# Load the collisions table and pull out the columns used as features.
fname = os.path.join("input_data", "collisions.csv")
collisions_data = MyPyTable().load_from_file(fname)

weather = collisions_data.get_column('WEATHER')
road_condition = collisions_data.get_column('ROADCOND')
light_condition = collisions_data.get_column('LIGHTCOND')
junction_type = collisions_data.get_column('JUNCTIONTYPE')
severity = collisions_data.get_column('SEVERITYDESC')

# One instance per row, labelled with its collision type.
X_train = [[
    weather[i], road_condition[i], light_condition[i], junction_type[i],
    severity[i]
] for i in range(len(weather))]
y_train = collisions_data.get_column('COLLISIONTYPE')

# Drop every instance whose label is 'Unknown'. The lists are rebuilt
# instead of deleting while iterating: deleting in place shifts the
# remaining items and skips the element right after each deletion.
known = [(row, label) for row, label in zip(X_train, y_train)
         if label != 'Unknown']
X_train = [row for row, _ in known]
y_train = [label for _, label in known]

# 10-fold stratified cross-validation; get_from_folds is given the data and
# the fold lists and returns the four per-fold train/test collections.
strattrain_folds, strattest_folds = myevaluation.stratified_kfold_cross_validation(
    X_train, y_train, 10)
strat_xtrain, strat_ytrain, strat_xtest, strat_ytest = myutils.get_from_folds(
    X_train, y_train, strattrain_folds, strattest_folds)