Example #1
should_recompute = False
use_clean_data = True
unique_games_only = True
should_remove_outliers = False

properties_to_use = ['Critic_Score', 'User_Score', 'Year_of_Release']
property_to_predict = 'Has_Great_Sales'
test_size = 0.33
k_nearest_neighbors = 5

random_state = 42  # Random seed for the train/test split

import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from GameDataCleaning import GameDataCleaning

dataset = GameDataCleaning.get_data(should_recompute, use_clean_data,
                                    unique_games_only, should_remove_outliers)

# create design matrix X and target vector y
X = np.array(dataset[properties_to_use].copy())
y = np.array(dataset[property_to_predict].copy())
print(len(X))
print(len(y))

# split into train and test
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=test_size,
                                                    random_state=random_state)

knn = KNeighborsClassifier(n_neighbors=k_nearest_neighbors)
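# A minimal completion sketch (assumption: the original example continues by
# fitting the classifier and scoring it with the accuracy_score imported above).
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(accuracy_score(y_test, y_pred))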
Example #2
from GameDataCleaning import GameDataCleaning


def greatest(naSales, euSales, jpSales):
    # Code for the continent with the greatest sales: 1 = NA, 2 = EU, 3 = JP
    if naSales >= euSales and naSales >= jpSales:
        theGreatest = 1
    elif euSales >= jpSales:
        theGreatest = 2
    else:
        theGreatest = 3
    return theGreatest


def criticScoreRound(criticScore):
    if criticScore >= 80: criticScore = 1
    elif criticScore >= 60: criticScore = 2
    else: criticScore = 3
    return criticScore


def userScoreRounded(userScore):
    if userScore >= 8: userScore = 1
    elif userScore >= 6: userScore = 2
    else: userScore = 3
    return userScore


if __name__ == '__main__':
    dataset = GameDataCleaning.get_data()

    dataset = dataset.dropna(axis=0, how='any')
    dataset = dataset[dataset.RatingCode != -1]

    dataset['continent_greatest_sales'] = dataset.apply(
        lambda x: greatest(x['NA_Sales'], x['EU_Sales'], x['JP_Sales']),
        axis=1)
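    # A sketch of binning the score columns with the helpers above (assumption:
    # the *_Bin column names are hypothetical; 'Critic_Score' and 'User_Score'
    # are the raw score columns this project uses elsewhere).
    dataset['Critic_Score_Bin'] = dataset['Critic_Score'].apply(criticScoreRound)
    dataset['User_Score_Bin'] = dataset['User_Score'].apply(userScoreRounded)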

    dataset.to_csv("theGames", sep=",", encoding='utf-8', index=False)

    print("This is the end")
Example #3
    def cluster(self):
        should_recompute = False
        use_clean_data = True
        unique_games_only = True
        should_remove_outliers = False

        number_of_clusters = 7
        init_type = 'k-means++'  # Options: 'k-means++' or 'random'
        cluster_run_times = 20  # Number of times k-means is run with different centroid seeds
        max_iterations = 300  # Maximum number of iterations per clustering run
        random_state = 42  # Random seed used to initialize the centers
        algorithm_to_use = 'auto'  # Options: 'auto', 'full', 'elkan'
        verbose = 1  # Verbosity mode

        # 'Publisher' plus exactly three normalized attributes (only the three numeric ones are clustered)
        properties_to_cluster = [
            'Publisher', 'Total_Publisher_Games',
            'Average_Publisher_Sales_Normalized',
            'Average_Publisher_Score_Normalized'
        ]
        # The same three attributes, in their non-normalized versions (used for the plot axes)
        properties_to_show = [
            'Total_Publisher_Games', 'Average_Publisher_Sales',
            'Average_Publisher_Score'
        ]
        # Must contain at least as many colors as there are clusters
        colors = [
            'green', '#6f6f6f', 'red', 'blue', 'purple', '#FF530D', 'yellow',
            'teal', 'black'
        ]

        # Get the cleaned data
        dataset = GameDataCleaning.get_data(should_recompute, use_clean_data,
                                            unique_games_only,
                                            should_remove_outliers)

        dataset = dataset.drop_duplicates('Publisher', keep='first').copy()
        dataset['Total_Publisher_Games'] = dataset.apply(
            lambda x: np.log2(x['Total_Publisher_Games']), axis=1)

        # Get the attributes we want to use to cluster with
        trim_dataframe = dataset[properties_to_cluster].copy()
        # Min-max normalize the log-scaled publisher game counts
        games_min = trim_dataframe['Total_Publisher_Games'].min()
        games_max = trim_dataframe['Total_Publisher_Games'].max()
        trim_dataframe['Total_Publisher_Games'] = trim_dataframe.apply(
            lambda x: (x['Total_Publisher_Games'] - games_min) /
            (games_max - games_min),
            axis=1)

        trim_dataframe = trim_dataframe.reset_index(drop=True)
        trim_dataframe2 = trim_dataframe[[
            'Total_Publisher_Games', 'Average_Publisher_Sales_Normalized',
            'Average_Publisher_Score_Normalized'
        ]].copy()
        # Convert to a NumPy array (DataFrame.as_matrix() has been removed from pandas)
        cluster_data = trim_dataframe2.values

        # Run KMeans
        kmeans = KMeans(n_clusters=number_of_clusters,
                        init=init_type,
                        n_init=cluster_run_times,
                        max_iter=max_iterations,
                        random_state=random_state,
                        verbose=verbose,
                        copy_x=True,
                        algorithm=algorithm_to_use)
        kmeans.fit(cluster_data)

        # Initialize a figure
        plt.figure(figsize=(10, 10))

        colormap = np.array(colors)

        # Make a figure of the publishers based on the attribute we generated
        publisher_types = dataset['Publisher_Type']
        '''trace = go.Scatter3d(
            x = dataset[properties_to_show[0]],
            y = dataset[properties_to_show[1]],
            z = dataset[properties_to_show[2]],
            #z = cluster_data[:, 2],
            mode = 'markers',
            marker = dict(
                color = colormap[publisher_types]
            ),
            text = dataset['Publisher']
        )
        #plot([trace], filename='clusters.html')
        
        # Make a figure of the publishers based on the clusters
        predY = np.choose(kmeans.labels_, [4, 5, 1, 3, 2, 0, 6]).astype(np.int64)
        trace = go.Scatter3d(
            x = dataset[properties_to_show[0]],
            y = dataset[properties_to_show[1]],
            z = dataset[properties_to_show[2]],
            mode = 'markers',
            marker = dict(
                color = colormap[predY]
            ),
            text = dataset['Publisher']
        )
            
        centroids = kmeans.cluster_centers_
        for center in centroids:
            center[0] = center[0] * (games_max - games_min) + games_min # undo the min-max scaling on x
            center[1] = center[1] * dataset[properties_to_show[1]].max() # the y
            center[2] = center[2] * dataset[properties_to_show[2]].max() # the z
            
        centroidTrace = go.Scatter3d(
            x = centroids[:, 0], 
            y = centroids[:, 1], 
            z = centroids[:, 2],
            marker = dict(
                size = 10,
                color = 'rgb(255, 255, 255)',
                symbol='x',
                line = dict(
                    width = 2,
                )
            )
        )
        
        data = [trace, centroidTrace]
        layout = go.Layout(showlegend=False)
        fig = go.Figure(data=data, layout=layout)
        # Create the graph and show it in a browser
        plot(fig, filename='k-means-results.html')
        
        print(sm.accuracy_score(publisher_types, predY))
        print(sm.confusion_matrix(publisher_types, predY))
        '''
        predY = np.choose(kmeans.labels_,
                          [4, 5, 1, 3, 2, 0, 6]).astype(np.int64)

        bin_df = pd.DataFrame(predY)
        binTranslated = pd.DataFrame()
        binTranslated['Publisher'] = trim_dataframe['Publisher']
        binTranslated['Publisher_Bin'] = bin_df[0]
        dictToRet = binTranslated.set_index(
            'Publisher')['Publisher_Bin'].to_dict()
        return dictToRet
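    # Usage sketch (assumptions: cluster() is a method on a clustering helper
    # class, hypothetically named PublisherClustering here; the returned dict
    # maps each publisher name to its cluster bin):
    #
    #     publisher_bins = PublisherClustering().cluster()
    #     for publisher, bin_id in publisher_bins.items():
    #         print(publisher, bin_id)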
Example #4
""" Generates clusters from a given .csv Uses 3 dimensional representation of attributes """
from GameDataCleaning import GameDataCleaning
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=False)

data = GameDataCleaning.get_data()

scatter = dict(
    mode='markers',
    name='Scatter',
    type='scatter3d',
    x=data['Publisher'],
    y=data['Year_of_Release'],
    z=data['Global_Sales'],
    marker=dict(size=2, color='blue', colorscale='Portland')
    # Pre-defined color scales -
    # 'pairs' | 'Greys' | 'Greens' | 'Bluered' | 'Hot' | 'Picnic' |
    # 'Portland' | 'Jet' | 'RdBu' | 'Blackbody' | 'Earth' | 'Electric' | 'YIOrRd' | 'YIGnBu'
)

clusters = dict(
    delaunayaxis="y",
    name='Cluster',
    opacity=0.5,
    type='mesh3d',
    x=data['Publisher'],
    y=data['Year_of_Release'],
    z=data['Global_Sales'],
)
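
# A minimal completion sketch (assumption: the original example renders both the
# scatter and mesh traces with plotly.offline.plot, imported above; the layout
# title and output filename are placeholders).
layout = dict(title='Publishers by year and global sales', showlegend=False)
fig = dict(data=[scatter, clusters], layout=layout)
plot(fig, filename='3d-clusters.html')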