Ejemplo n.º 1
0
def main():
    """Show one scatter plot per pair of numerical features, colored by house.

    Loads the CSV file named by the parsed command-line arguments, calls the
    data frame's ``remove_nan`` and ``standardize`` steps, splits the
    standardized data by "Hogwarts House", and then opens one figure for every
    unordered pair of numerical features with one scatter series per house.
    """
    # Imported locally so the script does not depend on a file-level import.
    from itertools import combinations

    args = parse_arguments()
    df = dslr.read_csv(args.csvfile)
    df.get_numerical_features()
    df.remove_nan()
    df.standardize()
    numerical_features = df.numerical_features
    df_standardized = dslr.DataFrame(data=df.standardized)
    dfs_by_house = {
        house: df_standardized.get_df_filtered({"Hogwarts House": house})
        for house in set(df.data["Hogwarts House"])
    }
    print(
        "The 2 features Astronomy and Defense Against the Dark Arts are similar"
    )
    # combinations() yields (features[i], features[j]) for every i < j, in the
    # same order as a nested index loop would.
    for x_label, y_label in combinations(numerical_features, 2):
        _, ax = plt.subplots(1, 1, figsize=(6, 6))
        # A distinct name avoids shadowing the full data frame `df` above.
        for house_df in dfs_by_house.values():
            ax.scatter(house_df.data[x_label], house_df.data[y_label])
        # Labels depend only on the feature pair, so set them once per figure.
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.title("Scatter Plot")
        # Legend entries follow the order in which the series were drawn.
        plt.legend(list(dfs_by_house))
        plt.show()
Ejemplo n.º 2
0
def main():
    """Display the scatter-plot matrix of the numerical features by house.

    Loads the CSV file named by the parsed command-line arguments, calls the
    data frame's ``remove_nan`` and ``standardize`` steps, splits the
    standardized data by "Hogwarts House", and hands the per-house frames to
    ``scatter_plot_matrix_from_dict`` before showing the result.
    """
    args = parse_arguments()
    df = dslr.read_csv(args.csvfile)
    df.get_numerical_features()
    df.remove_nan()
    df.standardize()
    df_standardized = dslr.DataFrame(data=df.standardized)
    dfs_by_house = {
        house: df_standardized.get_df_filtered({"Hogwarts House": house})
        for house in set(df.data["Hogwarts House"])
    }
    # Keep the returned objects referenced until the window is closed.
    _fig, _axes = scatter_plot_matrix_from_dict(dfs_by_house,
                                                df.numerical_features)
    # pyplot.show() only accepts the keyword argument `block`; passing the
    # figure positionally is rejected by current Matplotlib releases.
    plt.show()
Ejemplo n.º 3
0
def main():
    """Train one classifier per house and save the weights to weights.json.

    For each house, a ``LogisticRegressor`` is trained on the standardized
    columns listed in ``to_train``. The learned coefficients are then
    converted so that they apply to the raw (non-standardized) features, and
    written as ``{house: {"cte": ..., feature: weight, ...}}``. Features a
    classifier was not trained on get a weight of 0.
    """
    args = parse_arguments()
    df = dslr.read_csv(args.csvfile)
    df.get_numerical_features()
    df.digitalize()
    df.replace_nan()
    df.standardize()
    data = df.standardized
    # Features used by each one-vs-all classifier.
    to_train = {
        "Gryffindor": ["History of Magic", "Transfiguration"],
        "Hufflepuff": df.numerical_features,
        "Ravenclaw": ["Charms", "Muggle Studies"],
        "Slytherin": ["Divination"],
    }
    to_save = {}
    for house, features in to_train.items():
        to_save[house] = {}
        Y = transform_label(data["Hogwarts House"], house)
        X = get_X(data, features)
        logistic_regressor = LogisticRegressor(X, Y)
        if args.verbose:
            print("Training one classifier on class", house)
        logistic_regressor.train(print_cost=args.verbose)
        # Undo the standardization in the weights. Assuming the standardized
        # value is (x - mu) / sigma (TODO confirm against dslr):
        #   theta * (x - mu) / sigma == (theta / sigma) * x - mu * theta / sigma
        # so each weight is divided by sigma and the constant collects the
        # -mu * theta / sigma terms.
        theta = [
            logistic_regressor.theta[i] / df.stand_coefs[feature]["sigma"]
            for i, feature in enumerate(features)
        ]
        cte = -sum(
            df.stand_coefs[feature]["mu"] * logistic_regressor.theta[i] /
            df.stand_coefs[feature]["sigma"]
            for i, feature in enumerate(features)
        )
        to_save[house]["cte"] = cte
        for feature, weight in zip(features, theta):
            to_save[house][feature] = weight
        # Every house exposes the same keys; unused features weigh nothing.
        for feature in df.numerical_features:
            to_save[house].setdefault(feature, 0)

    # The context manager closes the file; no explicit close() is needed.
    with open('weights.json', 'w') as outfile:
        json.dump(to_save, outfile)
Ejemplo n.º 4
0
def main():
    """Predict a house for every row of the CSV and write houses.csv.

    Loads per-house weights from the JSON file given by ``args.weights``
    (each entry holds a "cte" constant plus one weight per numerical
    feature), scores every row with each house's classifier, and writes the
    highest-scoring house per "Index" to ``houses.csv``.
    """
    args = parse_arguments()
    df = dslr.read_csv(args.csvfile)
    del df.data["Hogwarts House"]
    df.get_numerical_features()
    df.digitalize()
    df.replace_nan()
    # Use a context manager so the weights file is closed deterministically.
    with open(args.weights) as weights_file:
        weights = json.load(weights_file)
    # Weight vectors ordered like df.numerical_features, built once per house.
    theta_by_house = {
        house: [weights[house][feature] for feature in df.numerical_features]
        for house in weights
    }
    predictions = []
    for i, index in enumerate(df.data["Index"]):
        x = [df.data[feature][i] for feature in df.numerical_features]
        probas = {
            house: logistic_function(
                weights[house]["cte"] + scalar_product(theta_by_house[house], x)
            )
            for house in weights
        }
        # "Gryffindor" is the default and wins ties: another house replaces
        # it only with a strictly greater probability.
        predict_house = "Gryffindor"
        for house, proba in probas.items():
            if proba > probas[predict_house]:
                predict_house = house

        predictions.append([int(index), predict_house])

    # The context manager closes the file; no explicit close() is needed.
    with open('houses.csv', 'w') as csvfile:
        writer = csv.DictWriter(csvfile,
                                fieldnames=['Index', 'Hogwarts House'],
                                lineterminator='\n')
        writer.writeheader()
        for index, house in predictions:
            writer.writerow({
                'Index': index,
                'Hogwarts House': house
            })
Ejemplo n.º 5
0
def main():
    """Show, for each numerical feature, a histogram grouped by house.

    Loads the CSV file named by the parsed command-line arguments, calls the
    data frame's ``remove_nan`` and ``standardize`` steps, splits the
    standardized data by "Hogwarts House", and opens one histogram figure per
    numerical feature with one data set per house.
    """
    args = parse_arguments()
    df = dslr.read_csv(args.csvfile)
    df.get_numerical_features()
    df.remove_nan()
    df.standardize()
    df_standardized = dslr.DataFrame(data=df.standardized)
    dfs_by_house = {
        house: df_standardized.get_df_filtered({"Hogwarts House": house})
        for house in set(df.data["Hogwarts House"])
    }
    print("The Care of Magical Creatures course has a homogeneous",
          "distribution of marks between the 4 houses")
    # Legend entries follow the iteration order of dfs_by_house, which is
    # also the order of the data sets passed to hist().
    houses = list(dfs_by_house)
    for feature in df.numerical_features:
        # A distinct name avoids shadowing the full data frame `df` above.
        to_plot = [house_df.data[feature] for house_df in dfs_by_house.values()]
        plt.hist(to_plot)
        plt.xlabel("Notes")
        plt.ylabel("Frequency")
        plt.legend(houses)
        plt.title(feature)
        plt.show()
Ejemplo n.º 6
0
def main():
    """Train on a train split and report the accuracy on the test split.

    Splits the loaded data with ``train_test_split``, trains one classifier
    per house on the standardized training data, converts the weights so they
    apply to raw features, then predicts a house for every test row. Each
    misclassified row is printed, followed by the overall score computed by
    hand and by ``accuracy_score``.
    """
    args = parse_arguments()
    df = dslr.read_csv(args.csvfile)
    df.get_numerical_features()
    df.digitalize()
    df.replace_nan()
    df_train, df_test = df.train_test_split()
    df_train.standardize()
    df_train.digitalize()
    df_test.digitalize()
    print("len train:", len(df_train.data["Hogwarts House"]))
    print("len test:", len(df_test.data["Hogwarts House"]))
    data = df_train.standardized
    # Features used by each one-vs-all classifier. To train every classifier
    # on all features instead, map each house to df.numerical_features.
    to_train = {
        "Gryffindor": ["History of Magic", "Transfiguration"],
        "Hufflepuff": df.numerical_features,
        "Ravenclaw": ["Charms", "Muggle Studies"],
        "Slytherin": ["Divination"],
    }
    to_save = {}
    for house, features in to_train.items():
        to_save[house] = {}
        Y = transform_label(data["Hogwarts House"], house)
        X = get_X(data, features)
        logistic_regressor = LogisticRegressor(X, Y)
        if args.verbose:
            print("Training one classifier on class", house)
        logistic_regressor.train(print_cost=args.verbose, max_iter=2000)
        # Undo the standardization in the weights so they can be applied to
        # the raw test data. Assuming the standardized value is
        # (x - mu) / sigma (TODO confirm against dslr):
        #   theta * (x - mu) / sigma == (theta / sigma) * x - mu * theta / sigma
        theta = [
            logistic_regressor.theta[i] / df_train.stand_coefs[feature]["sigma"]
            for i, feature in enumerate(features)
        ]
        cte = -sum(
            df_train.stand_coefs[feature]["mu"] * logistic_regressor.theta[i] /
            df_train.stand_coefs[feature]["sigma"]
            for i, feature in enumerate(features)
        )
        to_save[house]["cte"] = cte
        for feature, weight in zip(features, theta):
            to_save[house][feature] = weight
        # Every house exposes the same keys; unused features weigh nothing.
        for feature in df.numerical_features:
            to_save[house].setdefault(feature, 0)

    # Weight vectors ordered like df.numerical_features, built once instead
    # of once per test row.
    theta_by_house = {
        house: [house_weights[feature] for feature in df.numerical_features]
        for house, house_weights in to_save.items()
    }
    nb_error = 0
    Y_true = df_test.data["Hogwarts House"]
    Y_pred = []
    for i, real_house in enumerate(Y_true):
        x = [df_test.data[feature][i] for feature in df.numerical_features]
        # Score every trained classifier, not only the houses that happen to
        # appear in the test labels, so a house missing from the test split
        # cannot drop a classifier or break the default lookup below.
        predictions = {
            house: logistic_function(
                to_save[house]["cte"] + scalar_product(theta_by_house[house], x)
            )
            for house in to_save
        }

        # "Gryffindor" is the default and wins ties: another house replaces
        # it only with a strictly greater probability.
        predict_house = "Gryffindor"
        for house, proba in predictions.items():
            if proba > predictions[predict_house]:
                predict_house = house
        Y_pred.append(predict_house)
        if predict_house != real_house:
            print(predictions, "real:", real_house, "  predict:", predict_house)
            nb_error += 1

    print("precision:", 1 - nb_error / len(Y_true))
    print(accuracy_score(Y_true, Y_pred))