Example #1
    def predict(self, X_test):
        """Makes predictions for test instances in X_test.

        Args:
            X_test(list of list of numeric vals): The list of testing samples
                The shape of X_test is (n_test_samples, n_features)

        Returns:
            y_predicted(list of obj): The predicted target y values (parallel to X_test)
        """
        distances, indices = self.kneighbors(X_test)
        y_predicted = []

        # loop over each test instance
        for i in range(len(X_test)):
            neighbors = []

            # collect the y_train labels of the k nearest neighbors
            for j in range(len(indices[i])):
                neighbors.append(self.y_train[indices[i][j]])

            # the most frequent neighbor label becomes the prediction
            values, counts = myutils.get_frequencies(neighbors)
            max_val = max(counts)
            max_index = counts.index(max_val)
            prediction = values[max_index]
            y_predicted.append(prediction)

        # return the list of predicted y values
        return y_predicted
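
Every snippet on this page leans on a myutils.get_frequencies helper that is not shown (Example #2 calls a three-argument table variant of it). From how the one-argument form is called, with a flat list in and parallel values/counts lists out, a minimal sketch might be:

def get_frequencies(items):
    """Return parallel lists of the distinct values in items and their counts.

    A sketch of the helper assumed above; the actual myutils
    implementation may differ (e.g. in the order values appear).
    """
    values = []
    counts = []
    for item in items:
        if item in values:
            counts[values.index(item)] += 1
        else:
            values.append(item)
            counts.append(1)
    return values, counts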
Example #2
    def predict(self, X_test):
        """Makes predictions for test instances in X_test.

        Args:
            X_test(list of list of numeric vals): The list of testing samples
                The shape of X_test is (n_test_samples, n_features)
                
        Returns:
            y_predicted(list of obj): The predicted target y values (parallel to X_test)
        """
        y_predicted = []
        # build a header for the augmented rows: features, class label, index, distance
        header = []
        n_features = len(X_test[0])
        for val_num in range(n_features):
            col_name = "col_" + str(val_num)
            header.append(col_name)
        header.append("class label")
        header.append("index")
        header.append("distance")

        for test in X_test:
            # build augmented copies of the rows so self.X_train itself is
            # never mutated across test instances
            augmented_rows = []
            for i, instance in enumerate(self.X_train):
                row = list(instance[:n_features])
                # append the class label
                row.append(self.y_train[i])
                # append the original row index
                row.append(i)
                # append the distance to test
                if isinstance(test[0], (int, float)):
                    dist = myutils.compute_euclidean_distance(
                        row[:n_features], test)
                else:
                    dist = myutils.compute_categorical_distance(
                        row[:n_features], test)
                row.append(dist)
                augmented_rows.append(row)

            # sort the augmented rows by distance (the last column)
            train_sorted = sorted(augmented_rows, key=operator.itemgetter(-1))

            # grab the top k
            top_k = train_sorted[:self.n_neighbors]
            top_k_table = mypytable.MyPyTable(header, top_k)

            values, counts = myutils.get_frequencies(top_k_table.data,
                                                     top_k_table.column_names,
                                                     "class label")
            highest_val_count = max(counts)
            highest_val_index = counts.index(highest_val_count)
            y_predicted.append(values[highest_val_index])

        return y_predicted
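
The distance helpers are also assumed. Matching how they are called above (two equal-length feature vectors in, one number out), reasonable sketches are:

import math

def compute_euclidean_distance(v1, v2):
    """Euclidean distance between two parallel numeric vectors."""
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(v1, v2)))

def compute_categorical_distance(v1, v2):
    """Count of mismatched positions, for categorical features."""
    return sum(1 for a, b in zip(v1, v2) if a != b)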
Example #3
    def predict(self, X_test):
        """Makes predictions for test instances in X_test.

        Args:
            X_test(list of list of obj): The list of testing samples
                The shape of X_test is (n_test_samples, n_features)

        Returns:
            y_predicted(list of obj): The predicted target y values (parallel to X_test)
        """
        # weight each distinct label by its frequency in y_train, then draw
        # one label per test instance at random using those weights
        values, counts = myutils.get_frequencies(self.y_train)
        y_predicted = random.choices(values, weights=counts, k=len(X_test))

        return y_predicted
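
For intuition, random.choices with weights=counts draws each label with probability proportional to its training frequency. A standalone illustration with hypothetical labels:

import random

random.seed(0)
values, counts = ["yes", "no"], [3, 1]  # "yes" is three times as frequent
print(random.choices(values, weights=counts, k=12))
# roughly three out of every four draws come back "yes"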
Example #4
    def fit(self, X_train, y_train):
        """Fits a decision tree classifier to X_train and y_train using the TDIDT (top down induction of decision tree) algorithm.

        Args:
            X_train(list of list of obj): The list of training instances (samples). 
                The shape of X_train is (n_train_samples, n_features)
            y_train(list of obj): The target y values (parallel to X_train)
                The shape of y_train is n_train_samples

        Notes:
            Since TDIDT is an eager learning algorithm, this method builds a decision tree model
                from the training data.
            Build a decision tree using the nested list representation described in class.
            Store the tree in the tree attribute.
            Use attribute indexes to construct default attribute names (e.g. "att0", "att1", ...).
        """

        header = []
        attribute_domains = {}
        
        # create default attribute names ("att0", "att1", ...)
        for i in range(len(X_train[0])):
            header.append("att" + str(i))

        # build the attribute domains dictionary (attribute -> distinct values)
        count = 0
        for item in header:
            curr_col = myutils.get_column(X_train, count)
            values, counts = myutils.get_frequencies(curr_col)
            attribute_domains[item] = values
            count += 1

        # stitch X_train and y_train together and track the available attributes
        train = [X_train[i] + [y_train[i]] for i in range(len(X_train))]
        available_attributes = header.copy()

        # form the tree and store it in the tree attribute
        self.tree = myutils.tdidt(train, available_attributes, attribute_domains, header)
        self.print_decision_rules()
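
The exact nested-list layout is course-defined and not shown here; as a hypothetical illustration, self.tree might hold something like the following for two attributes:

# hypothetical tree in the nested-list form fit() builds:
# an attribute node names the attribute it splits on, each value branch
# holds a subtree, and a leaf records (label, count, total)
tree = ["Attribute", "att0",
            ["Value", "yes",
                ["Leaf", "play", 3, 5]],
            ["Value", "no",
                ["Attribute", "att1",
                    ["Value", "high",
                        ["Leaf", "stay", 2, 2]]]]]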
Example #5

    def predict(self, X_test):
        """Makes predictions for test instances in X_test.

        Args:
            X_test(list of list of numeric vals): The list of testing samples
                The shape of X_test is (n_test_samples, n_features)
                
        Returns:
            y_predicted(list of obj): The predicted target y values (parallel to X_test)
        """
        y_predicted = []
        distances, indices = self.kneighbors(X_test)
        for row in indices:
            knn_classifiers = []
            for index in row:
                knn_classifiers.append(self.y_train[index])
            values, freqs = myutils.get_frequencies(knn_classifiers)
            prediction_index = freqs.index(max(freqs))
            prediction = values[prediction_index]
            y_predicted.append(prediction)

        return y_predicted
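
A hypothetical end-to-end use of this k-nearest-neighbors predict (class and constructor names assumed):

knn = MyKNeighborsClassifier(n_neighbors=3)  # hypothetical class name
knn.fit([[1, 1], [1, 2], [5, 5]], ["yes", "yes", "no"])
print(knn.predict([[1, 1.5]]))  # -> ["yes"]: two of the three neighbors vote "yes"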
Example #6
    def predict(self, X_test):
        """ Uses majority voting in the decision forest to predict y values

        Args:
            X_test(list of list of obj): The list of testing samples

        Returns:
            y_predicted(list of obj): The predicted target y values (parallel to X_test)

        """
        y_predicted = []
        for i in range(len(X_test)):
            y_temp = []
            # collect one vote per tree for this single test instance
            for tree in self.trees:
                y_temp.append(tree.predict([X_test[i]])[0])
            values, value_sums = myutils.get_frequencies(y_temp)
            max_val = max(value_sums)
            max_index = value_sums.index(max_val)
            y_predicted.append(values[max_index])
        
        return y_predicted
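
The values/counts/argmax pattern above is a plain majority vote; the same step with the standard library, for reference:

from collections import Counter

votes = ["yes", "no", "yes"]  # hypothetical per-tree votes for one test instance
print(Counter(votes).most_common(1)[0][0])  # -> "yes"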
Example #7

    def fit(self, X_train, y_train):
        """Fits a Naive Bayes classifier to X_train and y_train.

        Args:
            X_train(list of list of obj): The list of training instances (samples).
                The shape of X_train is (n_train_samples, n_features)
            y_train(list of obj): The target y values (parallel to X_train)
                The shape of y_train is n_train_samples

        Notes:
            Since Naive Bayes is an eager learning algorithm, this method computes the prior probabilities
                and the posterior probabilities for the training data.
            You are free to choose the most appropriate data structures for storing the priors
                and posteriors.
        """
        self.X_train = X_train
        self.y_train = y_train
        self.priors = []
        self.posteriors = []
        header = []
        # copy each row so appending the class labels does not mutate X_train
        X_train_copy = [row.copy() for row in X_train]
        for i in range(len(X_train_copy)):
            X_train_copy[i].append(y_train[i])
        for i in range(len(X_train_copy[0])):
            header.append(str(i + 1))
        # group the rows by class label (the last column) to compute the priors
        classifier_names, classifier_subtables = myutils.group_by(
            X_train_copy, header, str(len(X_train_copy[0])))
        self.priors.append(classifier_names)
        for subtable in classifier_subtables:
            self.priors.append(len(subtable) / len(X_train_copy))
        # collect every attribute value seen, to label the posterior rows
        posteriors_header = []
        for i in range(len(X_train_copy[0]) - 1, 0, -1):
            temp_names, temp_subtables = myutils.group_by(
                X_train_copy, header, str(i))
            for name in temp_names:
                posteriors_header.append(name)
        # build an empty (attribute value) x (class label) posterior table
        posteriors_row = []
        for i in range(len(posteriors_header) + 1):
            for j in range(len(classifier_names) + 1):
                posteriors_row.append(0)
            self.posteriors.append(posteriors_row)
            posteriors_row = []
        self.posteriors[0][0] = "label"
        for i in range(len(self.posteriors[0]) - 1):
            self.posteriors[0][i + 1] = classifier_names[i]
        for i in range(1, len(self.posteriors)):
            self.posteriors[i][0] = str(posteriors_header[i - 1])
        # fill in P(attribute value | class) from each class's subtable
        for k in range(len(classifier_subtables)):
            header_col = myutils.get_column(self.posteriors,
                                            self.posteriors[0],
                                            self.posteriors[0][0])
            for i in range(len(header) - 1):
                col = myutils.get_column(classifier_subtables[k], header,
                                         header[i])
                values, counts = myutils.get_frequencies(col)
                for j in range(len(counts)):
                    row_index = header_col.index(str(values[j]))
                    header_col[row_index] = 0
                    col_index = self.posteriors[0].index(classifier_names[k])
                    self.posteriors[row_index][col_index] = counts[j] / len(
                        classifier_subtables[k])
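
myutils.group_by is another assumed helper. From the calls above (table, header, and a column name in; parallel lists of group names and row subtables out), a minimal sketch might be:

def group_by(table, header, col_name):
    """Partition table rows by their value in col_name.

    A sketch of the assumed helper: returns the distinct values seen in
    that column and, in parallel, the list of rows holding each value.
    """
    col_index = header.index(col_name)
    names = []
    subtables = []
    for row in table:
        value = row[col_index]
        if value not in names:
            names.append(value)
            subtables.append([])
        subtables[names.index(value)].append(row)
    return names, subtables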
Example #8
    def fit(self, X_train, y_train):
        """Fits a decision tree classifier to X_train and y_train using the TDIDT (top down induction of decision tree) algorithm.

        Args:
            X_train(list of list of obj): The list of training instances (samples). 
                The shape of X_train is (n_train_samples, n_features)
            y_train(list of obj): The target y values (parallel to X_train)
                The shape of y_train is n_train_samples

        Notes:
            Since TDIDT is an eager learning algorithm, this method builds a decision tree model
                from the training data.
            Build a decision tree using the nested list representation described in class.
            Store the tree in the tree attribute.
            Use attribute indexes to construct default attribute names (e.g. "att0", "att1", ...).
        """
        if self.seed is not None:
            random.seed(self.seed)

        n_trees = []
        for _ in range(self.N):
            header = []
            attribute_domains = {}
            
            # create default attribute names ("att0", "att1", ...)
            for i in range(len(X_train[0])):
                header.append("att" + str(i))

            # build the attribute domains dictionary (attribute -> distinct values)
            count = 0
            for item in header:
                curr_col = myutils.get_column(X_train, count)
                values, counts = myutils.get_frequencies(curr_col)
                attribute_domains[item] = values
                count += 1

            # stitch X_train and y_train together and track the available attributes
            train = [X_train[i] + [y_train[i]] for i in range(len(X_train))]
            available_attributes = header.copy()

            # bootstrap sample for this tree; rows left out form the validation set
            boot_train = myutils.compute_bootstrapped_sample(train)
            validation_set = []
            for row in train:
                if row not in boot_train:
                    validation_set.append(row)

            # form this tree from the bootstrap sample
            tree = myutils.tdidt_forest(boot_train, available_attributes, attribute_domains, header, self.F)

            # score this tree on its out-of-bag validation set
            tree_dict = {}
            tree_dict["tree"] = tree
            y_test = []
            for row in validation_set:
                y_test.append(row.pop())

            y_predict = myutils.predict_tree(validation_set, tree)

            acc = myutils.get_accuracy(y_predict, y_test)
            tree_dict["acc"] = acc
            n_trees.append(tree_dict)

        # keep the M most accurate trees
        sorted_trees = sorted(n_trees, key=lambda k: k["acc"], reverse=True)
        for i in range(self.M):
            self.trees.append(sorted_trees[i]["tree"])
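
compute_bootstrapped_sample and get_accuracy are likewise assumed; plausible sketches consistent with how they are used above:

import random

def compute_bootstrapped_sample(table):
    """Draw len(table) rows from table with replacement (a bootstrap sample)."""
    return [table[random.randrange(len(table))] for _ in range(len(table))]

def get_accuracy(y_predict, y_test):
    """Fraction of positions where the two parallel label lists agree."""
    matches = sum(1 for predicted, actual in zip(y_predict, y_test) if predicted == actual)
    return matches / len(y_test)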