def reusing_code_random_forest_on_iris() -> Dict:
    """
    Again I will run a classification on the iris dataset, but reusing
    the existing code from assignment1. Use this to check how different the results are (score and
    predictions).
    """
    df = read_dataset(Path('..', '..', 'iris.csv'))
    for c in list(df.columns):
        # Notice that I am now passing through all columns.
        # If your code does not handle normalizing categorical columns, do so now (just return the unchanged column)
        df = fix_outliers(df, c)
        df = fix_nans(df, c)
        df[c] = normalize_column(df[c])

    X, y = df.iloc[:, :4], df.iloc[:, 4]
    le = generate_label_encoder(y)

    # Be careful to return a copy of the input with the changes, instead of changing inplace the inputs here!
    y_encoded = replace_with_label_encoder(y.to_frame(), column='species', le=le)
    rf = simple_random_forest_classifier(X, y_encoded['species'])

    '''
    !!Explanation!!
    Both the classifier in this function and the one in the previous function yield roughly the same score on average.
    I believe this is because the two datasets are essentially the same at this point: both have label-encoded classes.
    The only differences are that this function removes NaNs and outliers, of which the dataset has few anyway,
    and normalizes the columns. From my understanding, min-max normalization does not change the values in relation
    to one another (see the small sketch after this function); it mainly rescales them, which may make training the
    model in this function a little more efficient. Because of that potential efficiency gain, I would choose this
    function's model over the previous one.
    '''
    print(rf['accuracy'])
    return rf
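
# A minimal sketch (not part of the assignment code) illustrating the point made in the
# explanation above: min-max normalization rescales a column into [0, 1] but preserves the
# ordering and relative spacing of the values, which is why the random forest score barely
# moves after normalizing. The helper name below is purely illustrative.
def _minmax_normalization_sketch():
    import pandas as pd

    col = pd.Series([4.3, 5.8, 7.9])  # e.g. three raw sepal lengths
    normalized = (col - col.min()) / (col.max() - col.min())
    # normalized is [0.0, 0.4166..., 1.0]: same order, same relative gaps as the raw values
    return normalized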
Example #2
    def plot_first_graph(n_clicks, dataset_name, x_column, y_column,
                         graph_type):

        if n_clicks is None:
            return go.Figure(), 'Rows Scanned : 0'

        if dataset_name == 'iris':
            df = iris_df
            rows_count = len(iris_df)
        elif dataset_name == 'video_game':
            df = video_game_df
            rows_count = len(video_game_df)
        elif dataset_name == 'life_expectancy':
            df = life_expectancy_df
            rows_count = len(life_expectancy_df)
        else:
            # Unknown dataset: return an empty figure instead of crashing further down
            return go.Figure(), 'Rows Read : 0'

        categorical_cols = get_text_categorical_columns(df)

        if x_column in categorical_cols:
            le = generate_label_encoder(df[x_column])
            df = replace_with_label_encoder(df, x_column, le)

        if y_column in categorical_cols:
            le = generate_label_encoder(df[y_column])
            df = replace_with_label_encoder(df, y_column, le)

        if graph_type == 'scatter':
            first_figure = px.scatter(df, x=x_column, y=y_column)

        elif graph_type == 'histogram':
            first_figure = px.histogram(df, x=x_column, color=y_column)

        elif graph_type == 'polar':
            first_figure = px.scatter_polar(df, r=x_column, theta=y_column)

        else:
            # Unknown graph type: fall back to an empty figure
            first_figure = go.Figure()

        final_rows_call = 'Rows Read : ' + str(rows_count)

        return first_figure, final_rows_call
Example #3
def reusing_code_random_forest_on_iris() -> Dict:
    """
    Again I will run a classification on the iris dataset, but reusing
    the existing code from assignment1. Use this to check how different the results are (score and
    predictions).
    """
    df = read_dataset(Path('..', '..', 'iris.csv'))
    for c in list(df.columns):
        # Notice that I am now passing through all columns.
        # If your code does not handle normalizing categorical columns, do so now (just return the unchanged column)
        df = fix_outliers(df, c)
        df = fix_nans(df, c)
        df[c] = normalize_column(df[c])

    X, y = df.iloc[:, :4], df.iloc[:, 4]
    le = generate_label_encoder(y)

    # Be careful to return a copy of the input with the changes, instead of changing inplace the inputs here!
    y_encoded = replace_with_label_encoder(y.to_frame(),
                                           column='species',
                                           le=le)
    return simple_random_forest_classifier(X, y_encoded['species'])
Example #4
def iris_clusters() -> Dict:
    """
    Let's use the iris dataset and cluster it:
    """
    df = pd.read_csv(Path('..', '..', 'iris.csv'))
    for c in list(df.columns):
        df = fix_outliers(df, c)
        df = fix_nans(df, c)
        df[c] = normalize_column(df[c])

    # Let's generate the clusters considering only the numeric columns first
    no_species_column = simple_k_means(df.iloc[:, :4])

    ohe = generate_one_hot_encoder(df['species'])
    df_ohe = replace_with_one_hot_encoder(df, 'species', ohe,
                                          list(ohe.get_feature_names()))

    # Notice that here I have binary columns, but I am using euclidean distance to do the clustering AND score evaluation
    # This is pretty bad
    no_binary_distance_clusters = simple_k_means(df_ohe)

    # Finally, let's use just a label encoder for the species.
    # It is still bad to turn the labels into numbers directly, because the distances between them do not make sense
    # (see the short sketch after this function).
    le = generate_label_encoder(df['species'])
    df_le = replace_with_label_encoder(df, 'species', le)
    labeled_encoded_clusters = simple_k_means(df_le)

    # See the result for yourself:
    print(no_species_column['score'], no_binary_distance_clusters['score'],
          labeled_encoded_clusters['score'])
    ret = no_species_column
    if no_binary_distance_clusters['score'] > ret['score']:
        print('no binary distance')
        ret = no_binary_distance_clusters
    if labeled_encoded_clusters['score'] > ret['score']:
        print('labeled encoded')
        ret = labeled_encoded_clusters
    return ret
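
# A small illustrative sketch (not part of the assignment code) of the point made in the
# comments above: a label encoder maps the three species to 0, 1 and 2, so 'virginica' ends
# up twice as far from 'setosa' as 'versicolor' does, even though the classes have no such
# ordering. K-means with euclidean distance will happily exploit that artificial geometry.
def _label_distance_sketch():
    from sklearn.preprocessing import LabelEncoder

    le = LabelEncoder().fit(['setosa', 'versicolor', 'virginica'])
    codes = le.transform(['setosa', 'versicolor', 'virginica'])  # array([0, 1, 2])
    # |0 - 2| = 2 while |0 - 1| = 1: an ordering and magnitude the labels never had
    return codes
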
def train_life_expectancy() -> Dict:
    """
    Do the same as the previous task with the result of the life expectancy task of e_experimentation.
    The label column is the value column. Remember to drop columns you think are useless for
    the machine learning (and say why you think so) and convert the remaining categorical columns with one_hot_encoding.
    Feel free to change your e_experimentation code (changes there will not be considered for grading
    purposes) to optimise the model (e.g. score, parameters, etc).
    """

    df = process_life_expectancy_dataset()
    '''
    !!!Explanation!!!
    The code below fixes the many NaN values that appear in the expectancy column.
    I originally intended to replace them with the column mean, as I did in the classification file,
    BUT that gave a large number of instances the same target value, which produced regressors with very low scores.
    So I chose a different approach: drop every row with a NaN expectancy and simply work with the data we have.
    You will see that this resulted in a model with a very high score, which I discuss further in the results section.

    Dropping these rows also removes the instances whose year was NaN. I did not think the year could be sensibly
    reconstructed from the other rows, so I would have dropped those values anyway.
    '''
    # Drop the rows whose expectancy is missing (see the explanation above)
    df = df.dropna(subset=['expectancy'])
    print(df)

    le = generate_label_encoder(df['name'])
    df = replace_with_label_encoder(df, 'name', le)
    print(df)

    X = df
    X = X.drop(['expectancy'], axis=1)
    y = df['expectancy']

    rf = simple_random_forest_regressor(X=X, y=y)
    print(rf)

    dt = decision_tree_regressor(X=X, y=y)
    print(dt)
    '''
    !!!My Results!!!
    The models perform very well, BUT I think this may be because they are overfit. There seems to be a real risk of
    contamination with this data set: many instances are essentially identical except for the year value. Those rows
    can be split across both the training and the testing set, so the model trains on an instance and is then tested
    on a very, very similar version of it. It is like the model is cheating, and I do not expect it would do nearly as
    well on new, real-life data. (A sketch of one way to avoid this, splitting by country, follows this function.)

    To reduce the risk of overfitting without removing any more instances, I would collect more data specific to each
    year and country, so the model has information about the demographics and conditions of the country and can find
    patterns to predict life expectancy from those instead.

    To conclude my observation on dropping the missing expectancy values: I do not think the expectancy should have
    been set to the column mean, because it is the value being predicted, and fabricating it that way would make the
    model misrepresent the real data. To include the dropped rows in future experiments, we would have to find the
    actual values, for example by searching government publications or statistical analyses for the target country.
    Only with that complete profile would I feel comfortable adding those rows back into the data set.
    '''

    if rf['score'] > dt['score']:
        print('random forest wins!')
        return rf
    else:
        print('decision tree wins!')
        return dt
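
# A hedged sketch (not used above) of one way to reduce the train/test contamination
# described in the results: split by country so that all rows for a given country end up in
# either the training set or the test set, never both. `groups` would be the (encoded)
# country name column; the helper name and parameters here are illustrative only.
def _grouped_split_sketch(X, y, groups):
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import GroupShuffleSplit

    splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=0)
    train_idx, test_idx = next(splitter.split(X, y, groups=groups))
    model = RandomForestRegressor().fit(X.iloc[train_idx], y.iloc[train_idx])
    # Score on countries the model has never seen, which should give a more honest estimate
    return model.score(X.iloc[test_idx], y.iloc[test_idx])
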
def train_amazon_video_game() -> Dict:
    """
    Run the result of the amazon dataset task of e_experimentation using the
    decision tree regressor AND random_forest regressor. Return the one with lowest R^2.
    The Label column is the count column
    Discuss (1 sentence) what you found different between the results.
    In one sentence, why is the score different (or the same) compared to the iris score?
    Feel free to change your e_experimentation code (changes there will not be considered for grading
    purposes) to optimise the model (e.g. score, parameters, etc).
    """
    df = process_amazon_video_game_dataset()
    print(df)
    '''
    !!!Explanation!!!
    I used the same logic as in the classification file: one-hot encoding every product ID (asin) would need far too
    many columns, so I just label encode them instead; they still represent the same information.
    '''
    le = generate_label_encoder(df['asin'])
    df = replace_with_label_encoder(df, 'asin', le)
    '''
    !!!Explanation!!!
    Note: this is the same explanation I gave for dropping the time column in the classification file, restated here
    just in case.
    I decided to drop the time column because I do not think it correlates with the target label. The time mostly
    indicates how recently the user was active, which changes as soon as the user reviews again. My concern is that
    the model might learn to key on when a user is active, which could overfit the model if user activity is somewhat
    random. For example, if a user reviews a game that came out today, after not reviewing anything for 10 years, the
    model may mispredict for that user because it is biased towards the old activity dates. Sequels sometimes arrive
    after a long, long time, as any video game fan knows, and a player might want to review the newest entry in a
    series they used to review. I believe the model should predict from features describing the user's rating
    behaviour and stay independent of time, since there are no set rules for when a user might review.
    '''
    df = df.drop('time', axis=1)

    X, y = df.iloc[:, 1:], df.iloc[:, 0]

    rf = simple_random_forest_regressor(X, y)
    print(rf)

    dt = decision_tree_regressor(X, y)
    print(dt)
    '''
    !!!My Results!!!
    Using the original data with little processing other than encoding, we get models that score around 0.9999 on
    average. That may look very good, but I do not think it is realistic. I see two big issues with this data set,
    both of which I discuss in the classification file, but they seem even more apparent here, so I will reiterate.
    Issue #1: Unique labels.
        - A unique label is a label that only a single instance in the data set possesses. This is a problem for both
        regression and classification: if that instance lands in the test set, its label is never seen during training
        and it is essentially guaranteed to be predicted wrongly.
        - A solution would be to choose better labels, ones that describe the data in a more general way. The count
        feature is problematic because it is somewhat trivial (you can just count the number of reviews for each
        user), and without knowledge of that counting pattern the prediction is largely a guess.
    Issue #2: Data duplication.
        - Although this may seem to contradict issue #1, a balanced data set should contain a healthy mix of instances
        that share the same label but differ in their features.
        - The issue here is that many instances in this data set are extremely similar to others, apart from the video
        game being reviewed, and that can lead to overfitting.
        - Those near-duplicates can show up in both the training set AND the test set, contaminating the split. I have
        talked about this a lot throughout the assignment: the model learns the instances in training and is then
        scored on the very same instances, so the score misrepresents its ability to predict genuinely new data.
        (A small sketch for measuring this duplication follows this function.)
        - This is not as easy a fix as issue #1, but one solution is more data collection: features that are unique to
        the individual instances, not just to the class. That diversity should in theory reduce the contamination and
        therefore the overfitting.
    '''
    '''
    !!!Versus the Iris Data Set!!!
    The main point should be clear from my responses to the iris function and this one: the big difference between the
    results is that there is evident overfitting on this data set, and not much on the iris set.
    The biggest difference between the data sets themselves (if we were to somehow standardize them relative to each
    other) is that the iris data set is far more balanced and has much less potential for train/test contamination.
    If the duplication issues listed above were fixed and the Amazon data balanced out, a model could potentially
    learn it as well as it learns the iris data. Although the Amazon models had higher scores, I believe that is just
    overfitting, and the more balanced iris data set actually produced the better models.
    '''

    if rf['score'] < dt['score']:
        print('random forest wins!')
        return rf
    else:
        print('decision tree wins!')
        return dt
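
# A small hedged sketch (not run above) for quantifying the duplication described in
# issue #2: count how many rows share exactly the same feature values as another row, since
# those are the rows most likely to land on both sides of a random train/test split.
def _duplicate_feature_rows_sketch(X):
    duplicated = X.duplicated(keep=False)  # True for every row that has an identical twin
    return int(duplicated.sum()), len(X)
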
def your_choice() -> Dict:
    """
    Now choose one of the datasets included in the assignment1 (the raw one, before anything done to them)
    and decide for yourself a set of instructions to be done (similar to the e_experimentation tasks).
    Specify your goal (e.g. analyse the reviews of the amazon dataset), say what you did to try to achieve the goal
    and use one (or both) of the models above to help you answer that. Remember that these models are classification
    models, therefore it is useful only for categorical labels.
    We will not grade your result itself, but your decision-making and suppositions given the goal you decided.
    Use this as a small exercise of what you will do in the project.
    """
    '''
    !!!My Goal!!!
    I will be using the "Geography" dataset.
    With this dataset, I want to find out whether we can fit a model to predict a country's World Bank income group
    from a few geographical and World Bank related features.
    To find out, I preprocess the data as follows:
        - Fix any missing data in the columns mentioned below
        - Extract and label encode the World Bank income group column into the label vector
        - Extract and one hot encode the World bank region column into the feature vector
        - Extract latitude into the feature vector
        - Extract longitude into the feature vector
    I then train both a Decision Tree and a Random Forest, and return the model with the greater accuracy.
    '''
    df = pd.read_csv(Path('..', '..', 'geography.csv'))

    '''
    !!!Explanation!!!
    The only rows with NaNs in the target features were from the Vatican,
    so I replaced their null values with the values for Italy.
    I know they are technically separate, but until the data set can be filled in we will simply treat them the same.
    '''
    df['World bank region'].fillna(value='Europe & Central Asia', inplace=True)
    df['World bank, 4 income groups 2017'].fillna('High Income', inplace=True)

    le = generate_label_encoder(df_column=df['World bank, 4 income groups 2017'])
    df = replace_with_label_encoder(df=df, column='World bank, 4 income groups 2017', le=le)

    ohe = generate_one_hot_encoder(df_column=df['World bank region'])
    df = replace_with_one_hot_encoder(df=df, column='World bank region', ohe=ohe,
                                      ohe_column_names=ohe.get_feature_names())

    columns = ['Latitude', 'Longitude', 'x0_East Asia & Pacific', 'x0_Europe & Central Asia',
               'x0_Latin America & Caribbean', 'x0_Middle East & North Africa', 'x0_North America',
               'x0_South Asia', 'x0_Sub-Saharan Africa']
    X = df[columns]
    y = df['World bank, 4 income groups 2017']

    dt = decision_tree_classifier(X=X, y=y)
    #print(dt)
    rf = simple_random_forest_classifier(X=X, y=y)
    #print(rf)
    '''
    !!!My Results!!!
    Once again, on average the Decision Tree and the Random Forest yield similar results. Their accuracies are quite
    low, ranging from around 50 to nearly 70 percent depending on the split (a cross-validation sketch for getting a
    steadier estimate follows this function). I do not think much overfitting is occurring here, as the data are
    reasonably balanced and properly split into training and testing.
    The data set lacks columns relating to the economy, wealth, or demographics of each country, so I believe more
    data could help the model fit a mapping from demographic and wealth information to the income group (the target
    label). Features that could be collected as additional columns include average income, employment rate, tax
    information, and more.
    Although this model is just a start, it could be beneficial to those working on economic policies or tax plans:
    with enough relevant data and training, being able to consult such a model while drafting plans to benefit a
    country's economy could be useful :)
    '''
    if rf['accuracy'] > dt['accuracy']:
        #print('random forest wins')
        return rf
    else:
        #print('decision tree wins')
        return dt
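
# A hedged sketch (not part of the function above) of how one might get a steadier estimate
# of that 50-70% accuracy range: cross-validate the same features and labels instead of
# relying on a single train/test split. The helper name is illustrative only.
def _cross_validation_sketch(X, y):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import cross_val_score

    scores = cross_val_score(RandomForestClassifier(), X, y, cv=5)
    return scores.mean(), scores.std()
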
def train_life_expectancy() -> Dict:
    """
    Do the same as the previous task with the result of the life expectancy task of e_experimentation.
    The label column is the column which has north/south. Remember to drop columns you think are useless for
    the machine learning (and say why you think so) and convert the remaining categorical columns with one_hot_encoding.
    (check the c_regression examples to see example on how to do this one hot encoding)
    Feel free to change your e_experimentation code (changes there will not be considered for grading
    purposes) to optimise the model (e.g. score, parameters, etc).
    """
    df = process_life_expectancy_dataset()
    '''
    !!!Explanation!!!
    I dropped the year column because it contains many NaN values. It is not a value you can sensibly fix by averaging
    the non-empty entries; logically that would not make sense, and it would misrepresent the year column. I do not
    expect this to affect accuracy much, since the year should not have a big impact on classifying whether a country
    is in the north or the south, which is what this function does.
    '''
    df = df.drop(['year'], axis=1)
    '''
    !!!Explanation!!!
    The expectancy column also has a lot of NaN values, so I replace them with the mean of the column. I believe this
    is reasonable because the life expectancy is probably in a similar range for each country in this dataset, so the
    column mean is a usable stand-in for any country.
    Note: this hypothesis may not be great, as the range of expectancy is quite large (from my preprocessing, roughly
    75 years); but given that some countries are developing, and that the data go back many years, for now I believe
    the mean still gives a better representation than nothing. (A per-country version of this imputation is sketched
    after this function.)
    '''
    mean = get_column_mean(df, 'expectancy')
    df['expectancy'].fillna(value=mean, inplace=True)
    X = df
    X = X.drop(['latitude'], axis=1)
    y = df['latitude']
    print(X)
    print(y)

    '''
    !!! Explanation !!!
    I decided to label encode the country name. I could not leave the names as strings, since the model cannot read
    them, and one hot encoding them would be very space inefficient: there are many different country names, so we
    would need a lot of columns to encode them all.
    '''
    le = generate_label_encoder(X['name'])
    X = replace_with_label_encoder(X, 'name', le)

    rf = simple_random_forest_classifier(X, y)
    dt = decision_tree_classifier(X, y)

    '''
    !!!Explanation!!!
    Both the decision tree and the random forest perform very well, with accuracies of roughly 0.99, better than any
    classifier earlier in the assignment. I am inclined to believe this comes from overfitting caused by an unbalanced
    dataset. For example, Afghanistan appears many times with essentially identical attributes, because the year has
    been removed and many of the missing expectancy values were set to the column mean. These near-duplicate instances
    can go into both the training and the testing set: contamination! The model is then tested on things it already
    knows, which hands it a near-perfect score almost automatically... like a model cheating on a test. On genuinely
    new data, I think its performance would drop.

    Because of this imbalance, I do not think this dataset is great for classification, even with all of the
    preprocessing. A solution would be to balance the data set by collecting more information about the countries
    that are under-represented, and to add dimensions that are less redundant than missing or mean-filled
    expectancies; perhaps more general features relating to the climate, if we are still trying to predict whether a
    country is in the north or the south.
    '''
    if rf['accuracy'] > dt['accuracy']:
        print('random forest wins')
        return rf
    else:
        print('decision tree wins')
        return dt
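
# A hedged alternative (not used above) to filling missing expectancies with the global
# column mean: impute the mean per country, which matches the intuition in the explanation
# that expectancy varies far more across countries than within one country's rows.
def _per_country_imputation_sketch(df):
    out = df.copy()
    out['expectancy'] = out.groupby('name')['expectancy'] \
                           .transform(lambda s: s.fillna(s.mean()))
    return out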