def main():
    args = parse_arguments()
    df = dslr.read_csv(args.csvfile)
    df.get_numerical_features()
    df.remove_nan()
    df.standardize()
    numerical_features = df.numerical_features
    df_standardized = dslr.DataFrame(data=df.standardized)
    # One standardized sub-DataFrame per Hogwarts house, used to colour the points.
    dfs_by_house = {
        house: df_standardized.get_df_filtered({"Hogwarts House": house})
        for house in set(df.data["Hogwarts House"])
    }
    print("The 2 features Astronomy and Defense Against the Dark Arts are similar")
    # Scatter plot for every pair of numerical features.
    for i in range(len(numerical_features) - 1):
        for j in range(i + 1, len(numerical_features)):
            fig, ax = plt.subplots(1, 1, figsize=(6, 6))
            x_label = numerical_features[i]
            y_label = numerical_features[j]
            for house, df_house in dfs_by_house.items():
                ax.scatter(df_house.data[x_label], df_house.data[y_label])
            plt.xlabel(x_label)
            plt.ylabel(y_label)
            plt.title("Scatter Plot")
            plt.legend(list(dfs_by_house.keys()))
            plt.show()
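# parse_arguments is not shown in this excerpt. A minimal argparse sketch of
# what the scripts appear to expect (a positional csvfile; the training and
# prediction scripts also read args.verbose and args.weights) could look like
# this; the real helper may differ.
import argparse


def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument("csvfile", help="path to the dataset CSV file")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="print training progress")
    parser.add_argument("-w", "--weights", default="weights.json",
                        help="path to the JSON weights file (prediction only)")
    return parser.parse_args()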
def main():
    args = parse_arguments()
    df = dslr.read_csv(args.csvfile)
    df.get_numerical_features()
    df.remove_nan()
    df.standardize()
    df_standardized = dslr.DataFrame(data=df.standardized)
    # One standardized sub-DataFrame per Hogwarts house.
    dfs_by_house = {
        house: df_standardized.get_df_filtered({"Hogwarts House": house})
        for house in set(df.data["Hogwarts House"])
    }
    fig, axes = scatter_plot_matrix_from_dict(dfs_by_house, df.numerical_features)
    plt.show()
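# scatter_plot_matrix_from_dict is not defined in this excerpt. A minimal
# sketch consistent with how it is called above (one colour per house, one
# panel per feature pair, histograms on the diagonal) is given here; the
# actual dslr helper may differ.
import matplotlib.pyplot as plt


def scatter_plot_matrix_from_dict(dfs_by_house, features):
    n = len(features)
    fig, axes = plt.subplots(n, n, figsize=(2 * n, 2 * n))
    for row, y_label in enumerate(features):
        for col, x_label in enumerate(features):
            ax = axes[row][col]
            for house, df_house in dfs_by_house.items():
                if row == col:
                    # Diagonal: distribution of the feature itself.
                    ax.hist(df_house.data[x_label], alpha=0.5, label=house)
                else:
                    ax.scatter(df_house.data[x_label], df_house.data[y_label],
                               s=1, label=house)
            ax.set_xticks([])
            ax.set_yticks([])
            if col == 0:
                ax.set_ylabel(y_label, fontsize=6)
            if row == n - 1:
                ax.set_xlabel(x_label, fontsize=6)
    axes[0][0].legend(fontsize=6)
    return fig, axes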
def main():
    args = parse_arguments()
    df = dslr.read_csv(args.csvfile)
    df.get_numerical_features()
    df.digitalize()
    df.replace_nan()
    df.standardize()
    data = df.standardized
    # Feature subset used to train each house's one-vs-all classifier.
    to_train = {
        "Gryffindor": ["History of Magic", "Transfiguration"],
        "Hufflepuff": df.numerical_features,
        "Ravenclaw": ["Charms", "Muggle Studies"],
        "Slytherin": ["Divination"],
    }
    to_save = {}
    for house, features in to_train.items():
        to_save[house] = {}
        Y = transform_label(data["Hogwarts House"], house)
        X = get_X(data, features)
        logistic_regressor = LogisticRegressor(X, Y)
        if args.verbose:
            print("Training one classifier on class", house)
        logistic_regressor.train(print_cost=args.verbose)
        # Undo the standardization so the saved weights apply to raw feature values:
        # theta_raw = theta / sigma, and the constant folds in -mu * theta / sigma.
        theta = [
            logistic_regressor.theta[i] / df.stand_coefs[feature]["sigma"]
            for i, feature in enumerate(features)
        ]
        cte = -sum(
            df.stand_coefs[feature]["mu"] * logistic_regressor.theta[i]
            / df.stand_coefs[feature]["sigma"]
            for i, feature in enumerate(features)
        )
        to_save[house]["cte"] = cte
        for i, feature in enumerate(features):
            to_save[house][feature] = theta[i]
        # Features not used by this classifier get a zero weight.
        for feature in df.numerical_features:
            to_save[house].setdefault(feature, 0)
    with open('weights.json', 'w') as outfile:
        json.dump(to_save, outfile)
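# transform_label and get_X are assumed helpers not shown in this excerpt.
# A minimal sketch consistent with the one-vs-all training loop above:
# transform_label builds a binary target (1 for the current house, 0 otherwise)
# and get_X builds the design matrix from the selected feature columns.
# The real implementations may differ.
def transform_label(labels, positive_class):
    # 1 when the sample belongs to positive_class, 0 otherwise.
    return [1 if label == positive_class else 0 for label in labels]


def get_X(data, features):
    # One row per sample, one column per selected feature.
    n_samples = len(data[features[0]])
    return [[data[feature][i] for feature in features] for i in range(n_samples)]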
def main():
    args = parse_arguments()
    df = dslr.read_csv(args.csvfile)
    # Drop the target column before building the feature matrix.
    del df.data["Hogwarts House"]
    df.get_numerical_features()
    df.digitalize()
    df.replace_nan()
    with open(args.weights) as weights_file:
        weights = json.load(weights_file)
    theta_by_house = {
        house: [weights[house][feature] for feature in df.numerical_features]
        for house in weights
    }
    predictions = []
    probas = {}
    for i in range(len(df.data["Index"])):
        x = [df.data[feature][i] for feature in df.numerical_features]
        # Probability of belonging to each house, from its one-vs-all classifier.
        for house in weights:
            cte = weights[house]["cte"]
            theta = theta_by_house[house]
            probas[house] = logistic_function(cte + scalar_product(theta, x))
        # Pick the house with the highest predicted probability.
        predict_house = "Gryffindor"
        proba_max = probas["Gryffindor"]
        for house, proba in probas.items():
            if proba > proba_max:
                proba_max = proba
                predict_house = house
        predictions.append([int(df.data["Index"][i]), predict_house])
    with open('houses.csv', 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['Index', 'Hogwarts House'],
                                lineterminator='\n')
        writer.writeheader()
        for prediction in predictions:
            writer.writerow({
                'Index': prediction[0],
                'Hogwarts House': prediction[1]
            })
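# logistic_function and scalar_product are assumed utilities not shown here.
# A minimal sketch matching their use above (probability of belonging to a
# house given the de-standardized weights); the real versions may differ.
import math


def logistic_function(z):
    # Sigmoid: maps any real number to a probability in (0, 1).
    return 1.0 / (1.0 + math.exp(-z))


def scalar_product(u, v):
    # Dot product of two equally sized vectors.
    return sum(a * b for a, b in zip(u, v))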
def main():
    args = parse_arguments()
    df = dslr.read_csv(args.csvfile)
    df.get_numerical_features()
    df.remove_nan()
    df.standardize()
    df_standardized = dslr.DataFrame(data=df.standardized)
    # One standardized sub-DataFrame per Hogwarts house.
    dfs_by_house = {
        house: df_standardized.get_df_filtered({"Hogwarts House": house})
        for house in set(df.data["Hogwarts House"])
    }
    print("The Care of Magical Creatures course has a homogeneous",
          "distribution of marks between the 4 houses")
    # One histogram per numerical feature, with the four houses side by side.
    for feature in df.numerical_features:
        to_plot = [df_house.data[feature] for df_house in dfs_by_house.values()]
        plt.hist(to_plot)
        plt.xlabel("Marks")
        plt.ylabel("Frequency")
        plt.legend(list(dfs_by_house.keys()))
        plt.title(feature)
        plt.show()
def main():
    args = parse_arguments()
    df = dslr.read_csv(args.csvfile)
    df.get_numerical_features()
    df.digitalize()
    df.replace_nan()
    df_train, df_test = df.train_test_split()
    df_train.standardize()
    df_train.digitalize()
    df_test.digitalize()
    print("len train:", len(df_train.data["Hogwarts House"]))
    print("len test:", len(df_test.data["Hogwarts House"]))
    data = df_train.standardized
    # Feature subset used to train each house's one-vs-all classifier.
    to_train = {
        "Gryffindor": ["History of Magic", "Transfiguration"],
        "Hufflepuff": df.numerical_features,
        "Ravenclaw": ["Charms", "Muggle Studies"],
        "Slytherin": ["Divination"],
        # Alternative: train every classifier on all numerical features.
        # "Gryffindor": df.numerical_features,
        # "Hufflepuff": df.numerical_features,
        # "Ravenclaw": df.numerical_features,
        # "Slytherin": df.numerical_features,
    }
    to_save = {}
    for house, features in to_train.items():
        to_save[house] = {}
        Y = transform_label(data["Hogwarts House"], house)
        X = get_X(data, features)
        logistic_regressor = LogisticRegressor(X, Y)
        if args.verbose:
            print("Training one classifier on class", house)
        logistic_regressor.train(print_cost=args.verbose, max_iter=2000)
        # Undo the standardization so the weights apply to raw feature values:
        # theta_raw = theta / sigma, and the constant folds in -mu * theta / sigma.
        theta = [
            logistic_regressor.theta[i] / df_train.stand_coefs[feature]["sigma"]
            for i, feature in enumerate(features)
        ]
        cte = -sum(
            df_train.stand_coefs[feature]["mu"] * logistic_regressor.theta[i]
            / df_train.stand_coefs[feature]["sigma"]
            for i, feature in enumerate(features)
        )
        to_save[house]["cte"] = cte
        for i, feature in enumerate(features):
            to_save[house][feature] = theta[i]
        # Features not used by this classifier get a zero weight.
        for feature in df.numerical_features:
            to_save[house].setdefault(feature, 0)
    # Evaluate the classifiers on the held-out test split.
    nb_error = 0
    Y_true = df_test.data["Hogwarts House"]
    Y_pred = []
    predictions = {}
    for i, real_house in enumerate(df_test.data["Hogwarts House"]):
        x = [df_test.data[feature][i] for feature in df.numerical_features]
        for house in set(df_test.data["Hogwarts House"]):
            cte = to_save[house]["cte"]
            theta = [to_save[house][feature] for feature in df.numerical_features]
            predictions[house] = logistic_function(cte + scalar_product(theta, x))
        # Pick the house with the highest predicted probability.
        predict_house = "Gryffindor"
        proba_max = predictions["Gryffindor"]
        for house, proba in predictions.items():
            if proba > proba_max:
                proba_max = proba
                predict_house = house
        Y_pred.append(predict_house)
        if predict_house != real_house:
            print(predictions, "real:", real_house, " predict:", predict_house)
            nb_error += 1
    print("accuracy:", 1 - nb_error / len(df_test.data["Hogwarts House"]))
    print(accuracy_score(Y_true, Y_pred))
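# LogisticRegressor itself is not shown in this excerpt. Below is a minimal
# batch gradient-descent sketch consistent with the interface used above
# (constructor taking X and Y, a train(print_cost, max_iter) method, and a
# theta attribute with one weight per feature; no intercept, since the inputs
# are standardized). The actual class may differ.
import math


class LogisticRegressor:
    def __init__(self, X, Y, learning_rate=0.1):
        self.X = X                       # list of samples, each a list of feature values
        self.Y = Y                       # list of 0/1 labels (one-vs-all target)
        self.learning_rate = learning_rate
        self.theta = [0.0] * len(X[0])   # one weight per feature

    def train(self, print_cost=False, max_iter=1000):
        m = len(self.X)
        for it in range(max_iter):
            gradients = [0.0] * len(self.theta)
            cost = 0.0
            for x, y in zip(self.X, self.Y):
                # Hypothesis: sigmoid of the linear combination of the features.
                z = sum(t * xj for t, xj in zip(self.theta, x))
                z = max(min(z, 30.0), -30.0)  # clamp for numerical stability in this sketch
                h = 1.0 / (1.0 + math.exp(-z))
                error = h - y
                for j, xj in enumerate(x):
                    gradients[j] += error * xj / m
                cost -= (y * math.log(h) + (1 - y) * math.log(1 - h)) / m
            # Simultaneous update of all weights.
            self.theta = [t - self.learning_rate * g
                          for t, g in zip(self.theta, gradients)]
            if print_cost and it % 100 == 0:
                print("iteration", it, "cost", cost)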