def reusing_code_random_forest_on_iris() -> Dict:
    """
    Classify the iris dataset again, reusing the preprocessing helpers from assignment1.

    Every column is passed through fix_outliers, fix_nans and normalize_column, the
    'species' labels are label-encoded, and a random forest classifier is trained so
    that its score and predictions can be compared with the previous task.

    :return: the result of simple_random_forest_classifier; its 'accuracy' entry is printed
    """
    iris = read_dataset(Path('..', '..', 'iris.csv'))

    # All columns are processed here, including the categorical one. If
    # normalize_column cannot handle categorical columns it should simply return
    # the column unchanged.
    for column_name in list(iris.columns):
        iris = fix_outliers(iris, column_name)
        iris = fix_nans(iris, column_name)
        iris[column_name] = normalize_column(iris[column_name])

    # First four columns are the features, the fifth one holds the labels.
    features = iris.iloc[:, :4]
    labels = iris.iloc[:, 4]

    encoder = generate_label_encoder(labels)
    # replace_with_label_encoder is expected to return a changed copy rather than
    # modifying its input in place.
    encoded_labels = replace_with_label_encoder(labels.to_frame(), column='species', le=encoder)
    result = simple_random_forest_classifier(features, encoded_labels['species'])

    # Explanation:
    # This classifier and the one from the previous task yield about the same score
    # on average. I believe this is because the two datasets are essentially the
    # same at this point: both have label-encoded classes. The only differences are
    # that this function removes NaNs and outliers (the dataset has few of either)
    # and normalizes the columns, which as far as I understand may not change the
    # values relative to each other. Normalization may just make this model more
    # efficient, and because of that potential boost I would choose this
    # function's model over the previous one.
    print(result['accuracy'])
    return result
def plot_first_graph(n_clicks, dataset_name, x_column, y_column, graph_type):
    """
    Build the first figure and its row-count caption for the selected dataset.

    :param n_clicks: when None, nothing is plotted and an empty figure is returned
    :param dataset_name: one of 'iris', 'video_game' or 'life_expectancy'
    :param x_column: column used for x (or r in the polar plot)
    :param y_column: column used for y (colour in the histogram, theta in the polar plot)
    :param graph_type: one of 'scatter', 'histogram' or 'polar'
    :return: tuple of (figure, caption string with the number of rows)
    """
    if n_clicks is None:
        return go.Figure(), 'Rows Scanned : 0'

    if dataset_name == 'iris':
        df = iris_df
    elif dataset_name == 'video_game':
        df = video_game_df
    elif dataset_name == 'life_expectancy':
        df = life_expectancy_df
    else:
        # Unknown dataset: there is nothing to plot. Previously this path carried
        # df=None forward and failed later on the unbound rows_count.
        return go.Figure(), 'Rows Scanned : 0'
    rows_count = len(df)

    # Text/categorical columns are label-encoded before plotting. The list of
    # categorical columns is computed once, before any encoding takes place.
    categorical_cols = get_text_categorical_columns(df)
    for axis_column in (x_column, y_column):
        if axis_column in categorical_cols:
            le = generate_label_encoder(df[axis_column])
            df = replace_with_label_encoder(df, axis_column, le)

    if graph_type == 'scatter':
        first_figure = px.scatter(df, x=x_column, y=y_column)
    elif graph_type == 'histogram':
        first_figure = px.histogram(df, x=x_column, color=y_column)
    elif graph_type == 'polar':
        first_figure = px.scatter_polar(df, r=x_column, theta=y_column)
    else:
        # Unknown graph type: return an empty figure instead of None so the
        # caller always receives a figure object.
        first_figure = go.Figure()

    final_rows_call = 'Rows Read : ' + str(rows_count)
    return first_figure, final_rows_call
def reusing_code_random_forest_on_iris() -> Dict:
    """
    Classify the iris dataset again, reusing the preprocessing helpers from assignment1.

    Every column is passed through fix_outliers, fix_nans and normalize_column, the
    'species' labels are label-encoded, and a random forest classifier is trained so
    that its score and predictions can be compared with the previous task.

    :return: the result of simple_random_forest_classifier
    """
    df = read_dataset(Path('..', '..', 'iris.csv'))

    # All columns are processed here, including the categorical one. If
    # normalize_column cannot handle categorical columns it should simply return
    # the column unchanged.
    for c in list(df.columns):
        df = fix_outliers(df, c)
        df = fix_nans(df, c)
        df[c] = normalize_column(df[c])

    # First four columns are the features, the fifth one holds the labels.
    X, y = df.iloc[:, :4], df.iloc[:, 4]
    le = generate_label_encoder(y)

    # replace_with_label_encoder is expected to return a changed copy rather than
    # modifying its input in place. Series.to_frame() is the pandas method that
    # wraps the label Series in a DataFrame; the former 'toframe' spelling does
    # not exist and raised AttributeError.
    y_encoded = replace_with_label_encoder(y.to_frame(), column='species', le=le)
    return simple_random_forest_classifier(X, y_encoded['species'])
def iris_clusters() -> Dict:
    """
    Cluster the iris dataset three ways and return the best-scoring result.

    The three runs of simple_k_means use (1) only the first four columns, (2) the
    frame with 'species' one-hot encoded and (3) the frame with 'species'
    label-encoded. The three scores are printed for comparison.

    :return: the simple_k_means result with the highest 'score'; on ties the
        earlier run in the order above is kept
    """
    df = pd.read_csv(Path('..', '..', 'iris.csv'))
    for c in list(df.columns):
        df = fix_outliers(df, c)
        df = fix_nans(df, c)
        df[c] = normalize_column(df[c])

    # Clusters considering only the first four (numeric) columns.
    no_species_column = simple_k_means(df.iloc[:, :4])

    ohe = generate_one_hot_encoder(df['species'])
    # Presumably a scikit-learn OneHotEncoder: newer releases drop
    # get_feature_names() in favour of get_feature_names_out(). Keep using the
    # original method whenever it exists and only fall back when it is missing.
    feature_names = getattr(ohe, 'get_feature_names', None) or ohe.get_feature_names_out
    df_ohe = replace_with_one_hot_encoder(df, 'species', ohe, list(feature_names()))

    # These are binary columns, yet euclidean distance is used for both the
    # clustering AND the score evaluation. This is pretty bad.
    no_binary_distance_clusters = simple_k_means(df_ohe)

    # Finally, use just a label encoder for the species. It is still bad to turn
    # the labels into numbers directly, because distances between them make no sense.
    le = generate_label_encoder(df['species'])
    df_le = replace_with_label_encoder(df, 'species', le)
    labeled_encoded_clusters = simple_k_means(df_le)

    # See the result for yourself:
    print(no_species_column['score'], no_binary_distance_clusters['score'], labeled_encoded_clusters['score'])

    ret = no_species_column
    if no_binary_distance_clusters['score'] > ret['score']:
        print('no binary distance')
        ret = no_binary_distance_clusters
    if labeled_encoded_clusters['score'] > ret['score']:
        print('labeled encoded')
        ret = labeled_encoded_clusters
    return ret
def train_life_expectancy() -> Dict:
    """
    Train regressors on the processed life expectancy dataset and return the better one.

    The label is the 'expectancy' column. Rows without a usable expectancy value
    (NaN or 0) are dropped, the 'name' column is label-encoded, and both a random
    forest regressor and a decision tree regressor are trained on the remaining
    columns.

    :return: the result with the higher 'score'; the decision tree result on a tie
    """
    df = process_life_expectancy_dataset()

    # Explanation:
    # Several rows have NaN as their expectancy value. I originally intended to
    # replace those with the column mean, as in the classification file, BUT that
    # gives a large number of instances the same target value and resulted in
    # regressors with very low scores. Instead, every row with a missing
    # expectancy is dropped and we simply work with the data we have. This
    # resulted in a model with a very high score (see the results below).
    # This also drops various instances whose year was NaN; a date cannot
    # sensibly be filled in from other rows, so those would have been dropped anyway.
    #
    # The rows are selected with an explicit mask instead of a chained
    # fillna(0, inplace=True) on the column, which is not guaranteed to modify
    # the frame. Rows whose expectancy is exactly 0 are dropped as before.
    has_expectancy = df['expectancy'].notna() & (df['expectancy'] != 0)
    df = df[has_expectancy].copy()
    print(df)

    le = generate_label_encoder(df['name'])
    df = replace_with_label_encoder(df, 'name', le)
    print(df)

    X = df.drop(['expectancy'], axis=1)
    y = df['expectancy']

    rf = simple_random_forest_regressor(X=X, y=y)
    print(rf)
    dt = decision_tree_regressor(X=X, y=y)
    print(dt)

    # My results:
    # The models perform very well, BUT I think this may be because they are
    # overfit. There is some risk of contamination with this dataset: many
    # instances are essentially the same except for the year value. Those can be
    # split across the training and testing sets, so the model learns one
    # instance and is then tested on a very, very similar one. It is like the
    # model is cheating, and I do not predict it would do so well on new,
    # real-life data.
    # To reduce the risk of overfitting without removing more instances, I would
    # collect more data specific to the year and the country, so the model has
    # more information about demographics and conditions and could find patterns
    # to predict life expectancy that way.
    # On dropping expectancy values: I do not think they should have been set to
    # the mean, since they are what is being predicted and a made-up target
    # misrepresents the real data. To include the dropped rows in future tests we
    # would have to find the actual data for them (e.g. government papers or
    # statistical analyses for the country); only with the complete data profile
    # would I feel comfortable adding them back into the dataset.
    if rf['score'] > dt['score']:
        print('random forest wins!')
        return rf
    else:
        print('decision tree wins!')
        return dt
def train_amazon_video_game() -> Dict:
    """
    Train a random forest regressor and a decision tree regressor on the processed
    Amazon video game dataset and return the one with the lowest score.

    The 'asin' column is label-encoded, the 'time' column is dropped, the first
    remaining column is used as the label and all other columns as features.

    :return: the result with the lower 'score'; the decision tree result on a tie
    """
    games = process_amazon_video_game_dataset()
    print(games)

    # Explanation:
    # Same logic as in the classification file: a one-hot encoding of all the
    # product names would need far too many bits, so they are label-encoded
    # instead, which still represents the same data.
    asin_encoder = generate_label_encoder(games['asin'])
    games = replace_with_label_encoder(games, 'asin', asin_encoder)

    # Explanation (same as for dropping the time column in the classification file):
    # I do not think the time column correlates with the target labels. It only
    # seems to indicate the activity of the user, which is updated as soon as the
    # user reviews again. The model might learn to check when a user is active,
    # which could overfit if user activity is somewhat random. For example, if a
    # user reviewed a game released today after not reviewing anything for 10
    # years, the model may not predict that user because it is biased towards the
    # activity dates. Sequels sometimes come out after a very long time, and a
    # player may want to review the newest entry of a series they used to review.
    # The model should be able to predict the user from features describing
    # rating behaviour, independent of time, as there are no set rules for when
    # a user might review.
    games = games.drop('time', axis=1)

    # First column is the label, everything after it is a feature.
    features = games.iloc[:, 1:]
    target = games.iloc[:, 0]

    forest_result = simple_random_forest_regressor(features, target)
    print(forest_result)
    tree_result = decision_tree_regressor(features, target)
    print(tree_result)

    # My results:
    # With little processing other than encoding, the models score around 0.9999
    # on average. This may look very good, but I do not think it is realistic.
    # There are two big issues with this dataset, both discussed in the
    # classification file but even more apparent here.
    #
    # Issue #1: Unique labels.
    # - A unique label is one that only a single instance in the dataset
    #   possesses. If that instance lands in the test set, the label is never
    #   learned by the model and will automatically be predicted wrongly.
    # - A solution would be to choose better labels that describe the data in a
    #   more general way. The count feature is problematic as it is somewhat
    #   trivial (you can just count the reviews per user), and without an idea of
    #   that counting pattern it is somewhat of a guess.
    #
    # Issue #2: Data duplication.
    # - Although this may seem to contradict issue #1, supervised learning needs a
    #   nice mix of instances that share labels but are otherwise different; I
    #   think this is the basis of a balanced dataset.
    # - Many instances here are extremely similar to others apart from the video
    #   game being reviewed. These near-duplicates can show up in both the
    #   training AND the test set, contaminating the sets: the model learns the
    #   instances in training and is then scored on those same instances, so the
    #   score misrepresents its ability to predict actually new data.
    # - This is not as easy to fix as issue #1. One solution may be more data
    #   collection: features that are unique not just to the class but to the
    #   individual instances. That diversity should reduce the contamination and
    #   therefore the overfitting.
    #
    # Versus the iris dataset:
    # The big difference is that there is evident overfitting going on with this
    # dataset and not much with iris. The iris dataset is a lot more balanced and
    # has less potential for train/test contamination. If the Amazon data were
    # balanced and the duplication issues above fixed, a model could potentially
    # learn it just as well as iris. Although the Amazon models had better
    # scores, I believe that is just due to overfitting, and thus the more
    # balanced iris dataset produced better models.
    if tree_result['score'] > forest_result['score']:
        print('random forest wins!')
        return forest_result
    print('decision tree wins!')
    return tree_result
def your_choice() -> Dict:
    """
    Predict the World Bank income group of a country from the raw geography dataset.

    Goal: find out whether a model can be fit to predict the
    'World bank, 4 income groups 2017' label from a few geographical and World
    Bank related features. Preprocessing:
      - fill the missing values in the two World Bank columns used below
      - label-encode the income group column (the labels)
      - one-hot encode the 'World bank region' column (features)
      - use 'Latitude' and 'Longitude' as features
    Both a decision tree and a random forest classifier are trained.

    :return: the result with the higher 'accuracy'; the decision tree result on a tie
    """
    df = pd.read_csv(Path('..', '..', 'geography.csv'))

    # Explanation:
    # The only rows with NaNs in the target features were from the Vatican, so
    # their missing values are replaced with the values of Italy. They are
    # technically separate, but until the dataset can be completed we simply
    # consider them the same.
    # The filled columns are assigned back instead of calling
    # fillna(inplace=True) on a column selection, which is not guaranteed to
    # modify the frame.
    df['World bank region'] = df['World bank region'].fillna(value='Europe & Central Asia')
    df['World bank, 4 income groups 2017'] = df['World bank, 4 income groups 2017'].fillna('High Income')

    le = generate_label_encoder(df_column=df['World bank, 4 income groups 2017'])
    df = replace_with_label_encoder(df=df, column='World bank, 4 income groups 2017', le=le)

    ohe = generate_one_hot_encoder(df_column=df['World bank region'])
    df = replace_with_one_hot_encoder(df=df, column='World bank region', ohe=ohe,
                                      ohe_column_names=ohe.get_feature_names())

    # NOTE(review): the 'x0_' names below presumably match what
    # ohe.get_feature_names() produced above -- confirm if the encoder changes.
    columns = ['Latitude', 'Longitude', 'x0_East Asia & Pacific', 'x0_Europe & Central Asia',
               'x0_Latin America & Caribbean', 'x0_Middle East & North Africa', 'x0_North America',
               'x0_South Asia', 'x0_Sub-Saharan Africa']
    X = df[columns]
    y = df['World bank, 4 income groups 2017']

    dt = decision_tree_classifier(X=X, y=y)
    rf = simple_random_forest_classifier(X=X, y=y)

    # My results:
    # Once again the decision tree and random forest yield similar results on
    # average. Their accuracies are quite low, ranging from around 50 to nearly
    # 70 percent. I do not think much overfitting is occurring here, as the
    # dataset is well balanced and properly split into training and testing.
    # The dataset lacks columns related to the economy, wealth or demographics of
    # the country, so more data may help the model fit a mapping between such
    # data and the income group (target label). Additional columns could include
    # average income, employment rate, tax information, and more.
    # Although this model is just a start, it could be beneficial to companies
    # figuring out economic policies or tax plans; with enough relevant training
    # data it could be useful while planning to benefit a country's economy :)
    if rf['accuracy'] > dt['accuracy']:
        return rf
    else:
        return dt
def train_life_expectancy() -> Dict:
    """
    Train classifiers on the processed life expectancy dataset and return the better one.

    The label is the 'latitude' column (the north/south column of the task). The
    'year' column is dropped, missing 'expectancy' values are replaced with the
    column mean, and the 'name' column is label-encoded before a random forest
    and a decision tree classifier are trained.

    :return: the result with the higher 'accuracy'; the decision tree result on a tie
    """
    df = process_life_expectancy_dataset()

    # Explanation:
    # The year column is dropped because it contains many NaN values. It is not a
    # value that can be fixed by averaging the non-empty entries; that would not
    # make sense logically and would misrepresent the column. I do not expect
    # this to affect accuracy much, as the year should not have a big impact on
    # classifying a country as being in the north or the south.
    df = df.drop(['year'], axis=1)

    # Explanation:
    # The expectancy column also has many NaN values, which are replaced with the
    # column average. I believe this is appropriate as life expectancy is
    # probably in a similar range for each country in this dataset.
    # Note: this hypothesis may not be great, as the range of expectancy is quite
    # large (after my preprocessing the mean is around 75 years), some countries
    # are developing and the data is from many years ago; for now I believe the
    # mean still gives a better representation than nothing.
    # The filled column is assigned back instead of calling fillna(inplace=True)
    # on a column selection, which is not guaranteed to modify the frame.
    mean = get_column_mean(df, 'expectancy')
    df['expectancy'] = df['expectancy'].fillna(value=mean)

    X = df.drop(['latitude'], axis=1)
    y = df['latitude']
    print(X)
    print(y)

    # Explanation:
    # The country name is label-encoded. It cannot stay a string because the
    # model would not be able to read it, and one-hot encoding would be very
    # space inefficient: there are many different country names and a lot of bits
    # would be needed to encode them all.
    le = generate_label_encoder(X['name'])
    X['name'] = le.fit_transform(X['name'])

    rf = simple_random_forest_classifier(X, y)
    dt = decision_tree_classifier(X, y)

    # Explanation:
    # Both the decision tree and the random forest perform very well, with ~.99
    # accuracy, much better than anything classified before. I am inclined to
    # believe this dataset has led to some overfitting due to being unbalanced.
    # For example, Afghanistan appears many times with every attribute identical,
    # since the year has been removed and many missing expectancy values are set
    # to the column mean. The duplicate instances may land in both the training
    # and the testing set -- contamination! The model is then tested on things it
    # already knows, getting them right almost automatically, kind of like
    # cheating on a test. On a completely new dataset I think performance would
    # drop.
    # Because of this imbalance I do not think this dataset is great for
    # classification, even with all of the preprocessing. A solution would be to
    # balance the dataset by collecting more information about less represented
    # countries, and to add dimensions that are less redundant than missing or
    # mean expectancies; perhaps more general features relating to the weather,
    # if we are still trying to predict north or south.
    if rf['accuracy'] > dt['accuracy']:
        print('random forest wins')
        return rf
    else:
        print('decision tree wins')
        return dt