Ejemplos de DataSet en Python, ejemplos de describe.DataSet en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: scatter_plot.py Proyecto: jcartwri/MLP

    def Plot(self, name_col1=7, name_col2=8, house_class=('M', 'B')):
        """
        Draw a scatter plot of two numeric dataset columns, one series per class.

        The figure is saved to ``data/scatter_plot.png`` and then shown.
        Validation problems are reported with ``print`` and the method
        returns without plotting.

        :param name_col1: column for the x axis, given either as an index
            (int) or as a header name (str).
        :param name_col2: column for the y axis, given either as an index
            (int) or as a header name (str).
        :param house_class: iterable of class labels (values found in the
            ``self.y_col`` column) to draw. Defaults to ``'M'`` and ``'B'``.
        :return: None
        """
        ds = DataSet(self.file_name)
        header = ds.dataset[0]

        # Resolve header names to column indexes; integers pass through.
        columns = []
        for column in (name_col1, name_col2):
            if isinstance(column, str):
                if column not in header:
                    print('Error: bad name column')
                    return
                column = header.index(column)
            columns.append(column)

        for column in columns:
            if column < 0 or column >= len(header):
                print("Error: This isn't column")
                return
            if not ds.isNumeric_columns(column):
                print("Error: Input column must is numerics")
                return

        # self.size caps the points drawn per class; clamp it to the number
        # of data rows (the first row of ds.dataset is the header).
        row_count = len(ds.dataset) - 1
        if self.size > row_count:
            self.size = row_count
        col1 = ds.get_float_col(columns[0])
        col2 = ds.get_float_col(columns[1])
        color = {
            'M': 'b',
            'B': 'r',
        }

        # Group both columns by class label, keeping only requested labels.
        feature1 = {label: [] for label in house_class}
        feature2 = {label: [] for label in house_class}
        for i, row in enumerate(ds.dataset[1:]):
            label = row[self.y_col]
            if label in feature1:
                feature1[label].append(col1[i])
                feature2[label].append(col2[i])

        for label, x_values in feature1.items():
            # color.get() yields None for labels without a configured colour,
            # which is scatter's default for ``c``; indexing raised KeyError.
            plot.scatter(x_values[:self.size],
                         feature2[label][:self.size],
                         c=color.get(label),
                         alpha=0.5,
                         label=label)
        if self.legend:
            plot.legend(loc='upper right')
        plot.ylabel('column is {}'.format(columns[1]))
        plot.xlabel('column is {}'.format(columns[0]))
        plot.title('Scatter Plot')
        plot.savefig('data/scatter_plot.png')
        plot.show()

Ejemplo n.º 2

0

Mostrar archivo

def main():
    """
    Train a logistic regression model on the dataset named on the command line.

    ``sys.argv[1]`` is interpreted relative to this script's directory. A
    ``results`` directory is created next to the script when it does not
    exist yet. The columns named in ``excluded_columns`` are dropped, the
    remaining ones are converted to floats, passed through ``MeanImputation``
    and ``Scaling``, and handed to ``LogisticRegression`` together with the
    'Hogwarts House' labels.

    :raises IndexError: if no dataset path is given on the command line.
    :raises ValueError: if an expected column name is missing from the header.
    """
    dirname = os.path.dirname(__file__)
    output_dirname = os.path.join(dirname, 'results')

    # Create the output directory if it is missing. Catching only
    # FileExistsError keeps the "already there" case silent without hiding
    # unrelated failures the way a bare ``except`` around os.stat() did.
    try:
        os.mkdir(output_dirname)
    except FileExistsError:
        pass

    file_name = os.path.join(dirname, sys.argv[1])

    d = DataSet(file_name)
    d.loadDataSet()

    header = d.data_set[0]
    excluded_columns = (
        'Index',
        'First Name',
        'Last Name',
        'Birthday',
        'Best Hand',
        'Hogwarts House',
        # Tests 7/10/18
        'Arithmancy',
        'Defense Against the Dark Arts',
        'Divination',
        'Muggle Studies',
        'History of Magic',
        'Transfiguration',
        'Potions',
        'Care of Magical Creatures',
        'Charms',
        'Flying',
    )
    to_remove = [header.index(name) for name in excluded_columns]

    # Keep every column that is not excluded; the first row (header) is
    # sliced off before the float conversion.
    X = np.array([[
        d.data_set[i][j] for j in range(len(header))
        if j not in to_remove
    ] for i in range(len(d.data_set))])
    X = convert_to_float(X[1:, ])

    y_col_nb = header.index('Hogwarts House')
    y = np.array(d.extractColumn(y_col_nb)[1:])

    # NOTE(review): the transform() results are discarded and X is reused
    # below, so these helpers presumably modify X in place - confirm.
    m = MeanImputation(X)
    m.train()
    m.transform()

    sc = Scaling(X)
    sc.train()
    sc.transform()

    model = LogisticRegression(X=X, y=y)
    model.train()

Ejemplo n.º 3

0

Mostrar archivo

 def __init__(self,
              path_to_data_set='resources/dataset_train.csv',
              legend=True,
              granularity=100):
     """
     Load the dataset and store the histogram display options.

     :param path_to_data_set: a string. Path of the dataset file, passed to ``DataSet``.
     :param legend: a boolean. True to also plot titles, axis legend, etc.; False to plot the histogram only.
     :param granularity: an integer. The number of barplots in the histogram.
     """
     loaded = DataSet(path_to_data_set)
     loaded.loadDataSet()
     self.data_set = loaded
     self.legend, self.granularity = legend, granularity

Ejemplo n.º 4

0

Mostrar archivo

 def __init__(
         self,
         path_to_data_set='resources/dataset_train.csv',
         legend=True,
         size=10):
     """
     Load the dataset and store the scatter-plot display options.

     :param path_to_data_set: a string. Path of the dataset file, passed to ``DataSet``.
     :param legend: a boolean. True to also plot titles, axis legend, etc.; False to plot the points only.
     :param size: an int. The size of the points in the scatter plot.
     """
     loaded = DataSet(path_to_data_set)
     loaded.loadDataSet()
     self.data_set = loaded
     self.legend, self.size = legend, size

Ejemplo n.º 5

0

Mostrar archivo

 def predict_file(self, theta=np.array([]), theta_exit=0):
     """
     Predict labels and probabilities for the samples stored in ``self.file``.

     The features are read through ``DataSet`` / ``self.get_x_y`` and passed
     through the class's private ``__add_intercept`` helper before prediction.

     :param theta: weight vector to use. When it is empty (or None) a vector
         of ones is used instead, unless ``theta_exit`` is truthy.
     :param theta_exit: when truthy, an empty ``theta`` is an error: a
         message is printed and the process exits.
     :return: a two-item list ``[self.predict(X), self.predict_prob(X)]``.
     :raises SystemExit: if ``theta`` is required but missing, or if its
         length differs from the number of columns of ``X``.
     """
     df = DataSet(filename=self.file)
     df.find_numeric_label()
     X = self.get_x_y(df, return_y=False)
     X = self.__add_intercept(X)
     self.theta = np.array(theta)
     # Test emptiness by size: ``not theta`` raises ValueError for arrays
     # with more than one element and treats a one-element [0.0] as missing.
     theta_missing = theta is None or self.theta.size == 0
     if theta_missing and theta_exit:
         print("Error: Have not theta")
         sys.exit()
     if theta_missing:
         self.theta = np.ones(X.shape[1])
     if self.theta.shape[0] != X.shape[1]:
         print('Error: bad theta or X')
         sys.exit()
     return [self.predict(X), self.predict_prob(X)]

Ejemplo n.º 6

0

Mostrar archivo

 def __init__(
         self,
         path_to_data_set='resources/dataset_train.csv',
         max_nb_features=4,
         fig_size=(8, 8)):
     """
     Load the dataset and store the pair-plot options.

     :param path_to_data_set: a string. Path of the dataset file; it is kept on the instance and passed to ``DataSet``.
     :param max_nb_features: an integer. Maximum number of features to analyze, counted from the first (leftmost) feature.
                             The cap keeps the figure readable: ~10 numeric features would otherwise mean 10**2 = 100 plots.
     :param fig_size: an integer tuple. The size of the figure to output.
     """
     self.path_to_data_set = path_to_data_set
     loaded = DataSet(path_to_data_set)
     loaded.loadDataSet()
     self.data_set = loaded
     self.max_nb_features, self.fig_size = max_nb_features, fig_size
     # Starts empty; this constructor does not detect the numeric columns.
     self.numeric_features = []

Ejemplo n.º 7

0

Mostrar archivo

    def fit(self):
        """
        Fit ``self.theta`` by gradient descent on the data in ``self.file``.

        Features and labels come from ``self.get_x_y``; an intercept column
        is added when ``self.fit_intercept`` is truthy. The weights start
        from ``np.random.randn`` values and are updated ``self.num_iter``
        times with step size ``self.lr``. When ``self.verbose`` is truthy
        the loss is printed every 10000 iterations.
        """
        dataset = DataSet(filename=self.file)
        dataset.find_numeric_label()
        X, y = self.get_x_y(dataset)
        if self.fit_intercept:
            X = self.__add_intercept(X)

        self.theta = np.random.randn(X.shape[1])
        for step in range(self.num_iter):
            activation = self.__sigmoid(np.dot(X, self.theta))
            # Gradient averaged over the number of labels.
            self.theta -= self.lr * (np.dot(X.T, (activation - y)) / y.size)
            if not self.verbose or step % 10000 != 0:
                continue
            # Report the loss with the freshly updated weights.
            h = self.__sigmoid(np.dot(X, self.theta))
            print(f'loss: {self.__loss(h, y)} \t')

Ejemplo n.º 8

0

Mostrar archivo

    def Plot(self, col_nb):
        """
        Draw one overlaid histogram per class for a numeric dataset column.

        The figure is saved to ``datasets/histogram.png`` and then shown.
        Validation problems are reported with ``print`` and the method
        returns without plotting.

        :param col_nb: the column to plot, given as an index or as a header
            name (str).
        :return: None
        """
        ds = DataSet(self.file_name)
        if type(col_nb) is str:
            # Translate a header name into its column index.
            if col_nb not in ds.dataset[0]:
                print('Error with name column')
                return
            col_nb = ds.dataset[0].index(col_nb)

        if not ds.isNumeric_columns(col_nb):
            print("Input column must is numerics")
            return

        values = ds.get_float_col(col_nb)
        stats = Math_calculat(values)
        # self.size evenly spaced bin edges between Quartile(0) and Quartile(1).
        bins = np.linspace(stats.Quartile(0), stats.Quartile(1), self.size)
        color = {
            'Ravenclaw': 'b',
            'Gryffindor': 'r',
            'Slytherin': 'g',
            'Hufflepuff': 'yellow'
        }

        # One bucket of values per distinct label found in the y column.
        per_label = {label: [] for label in set(ds.get_col(self.y_col))}
        for row_index in range(1, len(ds.dataset)):
            label = ds.dataset[row_index][self.y_col]
            per_label[label].append(values[row_index - 1])

        for label, samples in per_label.items():
            plot.hist(samples,
                      bins,
                      facecolor=color[label],
                      alpha=0.5,
                      label=label)
        if self.legend:
            plot.legend(loc='upper right')
        plot.ylabel('Frequency')
        plot.xlabel('Value')
        plot.title('Histogram')
        plot.savefig('datasets/histogram.png')
        plot.show()

Ejemplo n.º 9

0

Mostrar archivo

Archivo: pair_plot.py Proyecto: jcartwri/DSLR

    def Plot(self):
        """
        Draw a pair plot of the first ``self.max_nb_columns`` numeric columns.

        Cell (i, j) of the grid is a scatter plot with column i on x and
        column j on y, one series per class; diagonal cells hold per-class
        histograms of the column. The figure is saved to
        ``datasets/pair_plot.png`` and then shown.
        ``self.max_nb_columns`` is clamped to the number of numeric columns
        reported by the dataset.

        :return: None
        """
        ds = DataSet(self.file_name)
        ds.find_numeric_label()
        if self.max_nb_columns > len(ds.numeric_columns):
            self.max_nb_columns = len(ds.numeric_columns)

        color = {
            'Ravenclaw': 'b',
            'Gryffindor': 'r',
            'Slytherin': 'g',
            'Hufflepuff': 'yellow'
        }

        N = self.max_nb_columns
        # squeeze=False keeps ``ax`` two-dimensional even for a 1x1 grid,
        # where subplots() would otherwise return a bare Axes and ax[i, j]
        # would fail.
        fig, ax = plot.subplots(N, N, figsize=self.fig_size, squeeze=False)
        fig.tight_layout()

        header = ds.dataset[0]
        labels = set(ds.get_col(self.y_col))
        # Rows 1 .. len(ds.dataset[:self.size]) - 1 are plotted (row 0 is the
        # header), which matches the original row selection.
        row_labels = [
            ds.dataset[k][self.y_col]
            for k in range(1, len(ds.dataset[:self.size]))
        ]

        # Extract and group each column once instead of once per grid cell.
        columns = []
        grouped = []
        for n in range(N):
            column = ds.get_float_col(ds.numeric_columns[n])[:self.size]
            groups = {label: [] for label in labels}
            for k, label in enumerate(row_labels):
                groups[label].append(column[k])
            columns.append(column)
            grouped.append(groups)

        for i in range(N):
            for j in range(N):
                axis = ax[i, j]
                if i == 0:
                    axis.xaxis.set_label_position('top')
                    axis.set_xlabel(header[ds.numeric_columns[j]], rotation=0)
                if j == 0:
                    axis.set_ylabel(header[ds.numeric_columns[i]], rotation=0)
                if i == j:
                    statistic = Math_calculat(columns[i])
                    bins = np.linspace(statistic.Quartile(0),
                                       statistic.Quartile(1))
                    for label, values in grouped[i].items():
                        axis.hist(values, bins, facecolor=color[label],
                                  alpha=0.5, label=label)
                else:
                    for label, x_values in grouped[i].items():
                        axis.scatter(x_values, grouped[j][label],
                                     c=color[label], alpha=0.5, label=label)
                axis.tick_params(labelbottom=False)
                axis.tick_params(labelleft=False)

        if self.legend:
            plot.legend(loc='lower right')
        plot.savefig('datasets/pair_plot.png')
        plot.show()

Ejemplo n.º 10

0

Mostrar archivo

Archivo: logreg_fine_tune.py Proyecto: SachaIZADI/DisruptHogwarts

def main():
    """
    Experiment script used to fine-tune the algorithms.

    Loads the dataset named by ``sys.argv[1]`` (relative to this script),
    drops the excluded columns, runs the remaining ones through
    ``MeanImputation`` and ``Scaling``, splits the data with
    ``SplitTrainTest``, trains a ``LogisticRegression`` on the training part
    and prints the confusion matrices for the training and testing sets.
    """

    def report(title, matrix):
        # Print a banner followed by the given confusion matrix.
        print('\n\n')
        print(title)
        print('\n')
        matrix.Print()

    # Load the dataset
    relative_path = sys.argv[1]
    data = DataSet(os.path.join(os.path.dirname(__file__), relative_path))
    data.loadDataSet()

    # Remove useless features (not numeric + bad regressors).
    excluded_columns = (
        'Index',
        'First Name',
        'Last Name',
        'Birthday',
        'Best Hand',
        'Hogwarts House',
        # Tests 7/10/18
        'Arithmancy',
        'Defense Against the Dark Arts',
        'Divination',
        'Muggle Studies',
        'History of Magic',
        'Transfiguration',
        'Potions',
        'Care of Magical Creatures',
        'Charms',
        'Flying',
    )
    to_remove = [data.data_set[0].index(name) for name in excluded_columns]
    kept = [j for j in range(len(data.data_set[0])) if j not in to_remove]

    # The header row is sliced off before the float conversion.
    X = np.array([[row[j] for j in kept] for row in data.data_set])
    X = convert_to_float(X[1:, ])

    house_column = data.data_set[0].index('Hogwarts House')
    y = np.array(data.extractColumn(house_column)[1:])

    # Impute missing values
    imputer = MeanImputation(X)
    imputer.train()
    imputer.transform()

    # Scale the variables
    scaler = Scaling(X)
    scaler.train()
    scaler.transform()

    # Split the dataset in a training and testing set
    splitter = SplitTrainTest(X, y)
    splitter.Split()
    X_train, y_train, X_test, y_test = (splitter.X_train, splitter.y_train,
                                        splitter.X_test, splitter.y_test)

    # Train a logistic regression model
    model = LogisticRegression(X=X_train, y=y_train)
    model.train()

    # Compute the confusion matrix over the training set
    train_matrix = ConfusionMatrix(y_train, model.predict())
    train_matrix.getMatrix()
    report(
        '**************** Confusion Matrix on the training set ****************',
        train_matrix)

    # Compute the confusion matrix over the testing set, reusing the labels
    # found on the training set.
    test_matrix = ConfusionMatrix(y_test, model.predict(X_test),
                                  train_matrix.unique_labels)
    test_matrix.getMatrix()
    report(
        '**************** Confusion Matrix on the testing set ****************',
        test_matrix)

Ejemplo n.º 11

0

Mostrar archivo

class HistogramPerHouse:
    """
    - A class to plot the histogram of the Hogwarts features
    - Example to run:
        from histogram import HistogramPerHouse
        import matplotlib.pyplot as plt
        h = HistogramPerHouse()
        h.Plot(8)
        plt.show()
    """

    def __init__(self, path_to_data_set='resources/dataset_train.csv', legend=True, granularity=100):
        """
        :param path_to_data_set: a string. The path to the dataset.
        :param legend: a boolean. If legend is False, only the histogram is plotted. If legend is True, titles, axis legend, etc. are plotted.
        :param granularity: an integer. The number of barplots in the histogram.
        """
        self.data_set = DataSet(path_to_data_set)
        self.data_set.loadDataSet()
        self.legend = legend
        self.granularity = granularity

    def Plot(self, col_nb):
        """
        The plotting function. Draws one overlaid histogram per house on the
        current matplotlib axes; it neither shows nor saves the figure.

        :param col_nb: integer. The position of the column / feature to plot.
        """
        feature = self.data_set.extractColumn(col_nb=col_nb, convert_to_float=True)[1:]
        # The house label is read from column 1 of the dataset.
        houses = self.data_set.extractColumn(col_nb=1)[1:]

        # Group the feature values by house. setdefault replaces a bare
        # ``except:`` that would also have hidden unrelated errors.
        to_plot = {}
        for i, house in enumerate(houses):
            to_plot.setdefault(house, []).append(feature[i])

        full_list = []
        unique_houses = set(houses)
        for house in unique_houses:
            full_list += to_plot[house]

        # Common bin edges for all houses, spanning the whole feature range.
        s = Statistics(full_list)
        lower = s.Quartile(0)
        upper = s.Quartile(1)
        bins = np.linspace(lower, upper, self.granularity)

        colors = {
            'Hufflepuff': 'c',
            'Ravenclaw': 'orange',
            'Slytherin': 'g',
            'Gryffindor': 'r',
        }

        for house in unique_houses:
            # colors.get() yields None for a house without a configured
            # colour, which is hist's default, instead of raising KeyError.
            plt.hist(to_plot[house], bins, alpha=0.5, label=house, color=colors.get(house))

        if self.legend:
            plt.legend(loc='upper right')
            plt.title('Histogram of "%s" grades among the different Hogwarts houses' % self.data_set.data_set[0][col_nb])
            plt.xlabel("Grade")
            plt.ylabel("Count")

Ejemplo n.º 12

0

Mostrar archivo

class ScatterPlotPerHouse:
    """
    - A class to plot the scatter plot of a Hogwarts feature vs. another one
    - Example to run:
        from scatter_plot import ScatterPlotPerHouse
        import matplotlib.pyplot as plt
        sc = ScatterPlotPerHouse()
        sc.Plot(8,9)
        plt.show()
    """
    def __init__(self,
                 path_to_data_set='resources/dataset_train.csv',
                 legend=True,
                 size=10):
        """
        :param path_to_data_set: a string. The path to the dataset.
        :param legend: a boolean. If legend is False, only the points are plotted. If legend is True, titles, axis legend, etc. are plotted.
        :param size: an int. The size of the points in the scatter plot.
        """
        self.data_set = DataSet(path_to_data_set)
        self.data_set.loadDataSet()
        self.legend = legend
        self.size = size

    def Plot(self, col_nb_1, col_nb_2):
        """
        Plotting function. Draws one scatter series per house on the current
        matplotlib axes; it neither shows nor saves the figure.

        :param col_nb_1: integer. The position of the 1st column / feature to plot (x axis).
        :param col_nb_2: integer. The position of the 2nd column / feature to plot (y axis).
        """
        feature_1 = self.data_set.extractColumn(col_nb=col_nb_1,
                                                convert_to_float=True)[1:]
        feature_2 = self.data_set.extractColumn(col_nb=col_nb_2,
                                                convert_to_float=True)[1:]
        # The house label is read from column 1 of the dataset.
        houses = self.data_set.extractColumn(col_nb=1)[1:]

        to_plot = {
            'feature_1': {},
            'feature_2': {},
        }

        for i, house in enumerate(houses):
            # Only rows where both values are truthy are kept.
            # NOTE(review): presumably meant to skip missing values, but it
            # also drops legitimate 0.0 values - confirm how extractColumn
            # represents a missing entry.
            if feature_1[i] and feature_2[i]:
                # setdefault replaces bare ``except:`` clauses that would
                # also have hidden unrelated errors.
                to_plot['feature_1'].setdefault(house, []).append(feature_1[i])
                to_plot['feature_2'].setdefault(house, []).append(feature_2[i])

        unique_houses = set(houses)
        colors = {
            'Hufflepuff': 'c',
            'Ravenclaw': 'orange',
            'Slytherin': 'g',
            'Gryffindor': 'r',
        }

        for house in unique_houses:
            # A house whose rows were all filtered out has no entry in
            # to_plot, and an unknown house has no colour: .get() avoids a
            # KeyError in both cases (c=None is scatter's default).
            plt.scatter(x=to_plot['feature_1'].get(house, []),
                        y=to_plot['feature_2'].get(house, []),
                        c=colors.get(house),
                        alpha=0.5,
                        label=house,
                        s=self.size)

        if self.legend:
            plt.legend(loc='upper right')
            plt.title(
                'Scatter plot of "%s" vs "%s" grades among the different Hogwarts houses'
                % (self.data_set.data_set[0][col_nb_1],
                   self.data_set.data_set[0][col_nb_2]))
            plt.xlabel(self.data_set.data_set[0][col_nb_1])
            plt.ylabel(self.data_set.data_set[0][col_nb_2])

Ejemplo n.º 13

0

Mostrar archivo

class PairPlot:
    """
    - A class to plot the pair plot of many Hogwarts features.
    - Example to run:
        from pair_plot import PairPlot
        import matplotlib.pyplot as plt
        pp = PairPlot()
        pp.Plot()
        plt.show()
    """
    def __init__(self,
                 path_to_data_set='resources/dataset_train.csv',
                 max_nb_features=4,
                 fig_size=(8, 8)):
        """
        :param path_to_data_set: a string. The path to the dataset.
        :param max_nb_features: an integer. The number of features to analyze - analysis will start from the first feature (on the left) and continue until reaching the number max of features.
                                This was necessary for the sake of readability (there are ~10 numeric features, which would lead to 10**2 = 100 plots to do).
        :param fig_size: an integer tuple. The size of the figure to output.
        """
        self.path_to_data_set = path_to_data_set
        self.data_set = DataSet(self.path_to_data_set)
        self.data_set.loadDataSet()
        self.max_nb_features = max_nb_features
        self.fig_size = fig_size
        self.numeric_features = []

    def extractNumericFeatures(self):
        """
        Automatically extracts the numeric features in the dataset.

        Stores in ``self.numeric_features`` the index of every column, other
        than 'Index', for which ``isNumericFeature`` is true. The list is
        rebuilt on every call, so repeated calls do not duplicate entries.
        """
        header = self.data_set.data_set[0]
        self.numeric_features = [
            i for i in range(len(header))
            if header[i] != 'Index' and self.data_set.isNumericFeature(i)
        ]

    def Plot(self):
        """
        Plotting function. Draws an N x N grid on a new figure: histograms on
        the diagonal, scatter plots elsewhere, plus a figure-level legend.
        It neither shows nor saves the figure.

        The numeric features are detected automatically when
        ``extractNumericFeatures`` has not been called yet.

        :return: None
        """
        # Without this the documented usage (PairPlot().Plot()) ran with an
        # empty feature list and failed on the unbound ``ax`` below.
        if not self.numeric_features:
            self.extractNumericFeatures()

        plt.figure(figsize=self.fig_size)
        SMALL_SIZE = 5
        plt.rc('xtick', labelsize=SMALL_SIZE)
        plt.rc('ytick', labelsize=SMALL_SIZE)

        plt.suptitle("Pair Plot")

        features = self.numeric_features[:self.max_nb_features]
        N = len(features)
        if N == 0:
            # No numeric feature: nothing to draw and no axes for a legend.
            return

        header = self.data_set.data_set[0]
        # Build the helper plotters once; creating them inside the loops
        # reloaded the dataset for every subplot.
        h = HistogramPerHouse(path_to_data_set=self.path_to_data_set,
                              legend=False,
                              granularity=30)
        sc = ScatterPlotPerHouse(path_to_data_set=self.path_to_data_set,
                                 legend=False,
                                 size=1)

        for i in range(N):
            for j in range(N):
                ax = plt.subplot(N, N, 1 + j + i * N)
                if i == 0:
                    ax.xaxis.set_label_position('top')
                    plt.xlabel(header[features[j]], fontsize=8)
                if j == 0:
                    plt.ylabel(header[features[i]], fontsize=8)
                if i == j:
                    h.Plot(features[i])
                else:
                    sc.Plot(features[j], features[i])

        # The legend entries are taken from the last subplot drawn.
        handles, labels = ax.get_legend_handles_labels()
        plt.figlegend(handles, labels, loc='lower right', prop={'size': 6})

Ejemplo n.º 14

0

Mostrar archivo

def main():
    """
    Predict the Hogwarts house of every row of the dataset named on the
    command line and write the result to ``resources/houses.csv``.

    ``sys.argv[1]`` is interpreted relative to this script's directory. The
    columns named in ``excluded_columns`` are dropped and the remaining ones
    are converted to floats. ``MeanImputation``, ``Scaling`` and
    ``LogisticRegression`` are given the paths of ``mean_imputation.json``,
    ``scaling.json`` and ``beta.json`` in the ``results`` directory. The
    output CSV has the columns 'Index' and 'Hogwarts House'.

    :raises IndexError: if no dataset path is given on the command line.
    :raises ValueError: if an expected column name is missing from the header.
    """
    dirname = os.path.dirname(__file__)
    dirname_prediction = os.path.join(dirname, 'results')

    file_name = os.path.join(dirname, sys.argv[1])

    d = DataSet(file_name)
    d.loadDataSet()

    header = d.data_set[0]
    excluded_columns = (
        'Index',
        'First Name',
        'Last Name',
        'Birthday',
        'Best Hand',
        'Hogwarts House',
        # Tests 7/10/18
        'Arithmancy',
        'Defense Against the Dark Arts',
        'Divination',
        'Muggle Studies',
        'History of Magic',
        'Transfiguration',
        'Potions',
        'Care of Magical Creatures',
        'Charms',
        'Flying',
    )
    to_remove = [header.index(name) for name in excluded_columns]

    # Row identifiers, written next to each prediction (header row skipped).
    index_position = header.index('Index')
    indexes = np.array(
        [d.data_set[i][index_position] for i in range(len(d.data_set))])[1:]

    X = np.array([[
        d.data_set[i][j] for j in range(len(header))
        if j not in to_remove
    ] for i in range(len(d.data_set))])
    X = convert_to_float(X[1:, ])

    # NOTE(review): the transform() results are discarded and X is reused
    # below, so these helpers presumably modify X in place - confirm.
    m = MeanImputation(X,
                       path_to_mean_imputation=os.path.join(
                           dirname_prediction, 'mean_imputation.json'))
    m.transform()

    sc = Scaling(X,
                 path_to_scaling=os.path.join(dirname_prediction,
                                              'scaling.json'))
    sc.transform()

    model = LogisticRegression(X=X,
                               path_to_beta=os.path.join(dirname_prediction,
                                                         'beta.json'))
    predictions = model.predict()

    output_name = os.path.join(dirname, 'resources/houses.csv')
    # newline='' is what the csv module requires of files it writes to;
    # without it, platforms that translate '\n' get an extra '\r' per row.
    with open(output_name, 'w', newline='') as outfile:
        writer = csv.writer(outfile, delimiter=',')
        writer.writerow(['Index', 'Hogwarts House'])
        for i, index in enumerate(indexes):
            writer.writerow([index, predictions[i]])

Ejemplo n.º 15

0

Mostrar archivo

Archivo: logreg_fine_tune_sgd.py Proyecto: SachaIZADI/DisruptHogwarts

def main():
    """
    Experiment script: logistic regression trained with the 'sgd' optimizer.

    Loads the dataset named by ``sys.argv[1]`` (relative to this script),
    drops the excluded columns, runs the remaining ones through
    ``MeanImputation`` and ``Scaling``, splits the data with
    ``SplitTrainTest``, trains a ``LogisticRegression`` configured with
    ``optimizer='sgd'`` and prints the confusion matrices for the training
    and testing sets.
    """

    def show(banner, matrix):
        # Print a banner followed by the given confusion matrix.
        print('\n\n')
        print(banner)
        print('\n')
        matrix.Print()

    relative_path = sys.argv[1]
    dataset = DataSet(os.path.join(os.path.dirname(__file__), relative_path))
    dataset.loadDataSet()

    dropped = (
        'Index',
        'First Name',
        'Last Name',
        'Birthday',
        'Best Hand',
        'Hogwarts House',
        # Tests 7/10/18
        'Arithmancy',
        'Defense Against the Dark Arts',
        'Divination',
        'Muggle Studies',
        'History of Magic',
        'Transfiguration',
        'Potions',
        'Care of Magical Creatures',
        'Charms',
        'Flying',
    )
    to_remove = [dataset.data_set[0].index(name) for name in dropped]
    retained = [
        j for j in range(len(dataset.data_set[0])) if j not in to_remove
    ]

    # The header row is sliced off before the float conversion.
    X = np.array([[row[j] for j in retained] for row in dataset.data_set])
    X = convert_to_float(X[1:, ])

    label_column = dataset.data_set[0].index('Hogwarts House')
    y = np.array(dataset.extractColumn(label_column)[1:])

    imputer = MeanImputation(X)
    imputer.train()
    imputer.transform()

    scaler = Scaling(X)
    scaler.train()
    scaler.transform()

    splitter = SplitTrainTest(X, y)
    splitter.Split()
    X_train, y_train, X_test, y_test = (splitter.X_train, splitter.y_train,
                                        splitter.X_test, splitter.y_test)

    sgd_settings = {
        'alpha': 0.5,
        'n': 5,
        'batch_size': 16
    }
    model = LogisticRegression(X=X_train,
                               y=y_train,
                               optimizer='sgd',
                               optimizer_params=sgd_settings)
    model.train()

    # Confusion matrix on the data the model was trained on.
    train_matrix = ConfusionMatrix(y_train, model.predict())
    train_matrix.getMatrix()
    show(
        '**************** Confusion Matrix on the training set ****************',
        train_matrix)

    # Confusion matrix on the held-out data, reusing the training labels.
    test_matrix = ConfusionMatrix(y_test, model.predict(X_test),
                                  train_matrix.unique_labels)
    test_matrix.getMatrix()
    show(
        '**************** Confusion Matrix on the testing set ****************',
        test_matrix)