Example #1
    def _naive_bayes(self,
                     data: mldata.ExampleSet) -> Mapping[mldata.Feature, Any]:
        model_parameters = dict()
        labels = mlutil.get_labels(data)
        label = mlutil.get_label_info(data)
        # prior probability of the class label
        model_parameters[label] = self._compute_probability(labels)
        feature_examples = mlutil.get_feature_examples(data, as_dict=True)
        for feature, examples in feature_examples.items():
            exs = self._get_feature_values(feature, examples)
            # conditional probability of each feature value given the label
            model_parameters[feature] = self._compute_probability(exs, labels)
        return model_parameters
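The internals of _compute_probability are not shown in this listing. As a rough, self-contained sketch of the same idea, one might estimate the label prior and the per-feature conditionals by counting; the Laplace smoothing parameter alpha is an assumed choice here, not taken from the source:

from collections import Counter, defaultdict

def naive_bayes_parameters(features, labels, alpha=1):
    """Estimate P(label) and P(value | label) by counting.

    `features` maps feature name -> list of values, aligned with `labels`.
    Laplace smoothing (alpha) is an assumption, not from the original code.
    """
    n = len(labels)
    label_counts = Counter(labels)
    prior = {lab: count / n for lab, count in label_counts.items()}
    conditionals = {}
    for name, values in features.items():
        counts = defaultdict(Counter)
        for value, lab in zip(values, labels):
            counts[lab][value] += 1
        domain = set(values)
        conditionals[name] = {
            (value, lab): (counts[lab][value] + alpha)
                          / (label_counts[lab] + alpha * len(domain))
            for lab in label_counts
            for value in domain
        }
    return prior, conditionals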
Example #2
    def train(self, data):
        # initialize example weights to 1/N, where N is the number of examples
        weights = [1 / len(data)] * len(data)
        next_weights = [1 / len(data)] * len(data)

        i = 0
        epsilon = 0.5
        accuracies = []
        model = []  # the models themselves
        cweights = []  # the classifier weights
        # run each model and get predictions; note that we train and test
        # on the same data
        label = mlutil.get_labels(data)
        # map {0, 1} labels to {-1, +1} for boosting
        labels = [f * 2 - 1 for f in label]
        while i < self.iterations and 0 < epsilon <= 0.5:
            i = i + 1
            classifier = 0
            classifier_weight = 0
            epsilon = 0
            acc = 0
            # run through all algorithms we want per iteration
            for e, alg in enumerate(self.algorithm):
                weights = copy(next_weights)
                mod = weighted_model(data, weights, alg=alg)
                pred = mod.predict(data)
                # predict returns scores for most models and labels for the
                # decision tree; weighted_error computes the new example
                # weights, the classifier weight, the weighted error epsilon,
                # and the accuracy
                weights1, classifier_weight1, epsilon1, acc1 = weighted_error(
                    pred, labels, weights)
                if self.experiment == 2:  # experiment that uses all classifiers
                    model.append(mod)
                    cweights.append(classifier_weight1)
                    accuracies.append(acc1)  # record this model's own accuracy
                if acc1 > acc:
                    next_weights = weights1
                    classifier_weight = classifier_weight1
                    epsilon = epsilon1
                    acc = acc1
                    classifier = mod
            if self.experiment < 2:
                model.append(classifier)
                cweights.append(classifier_weight)
                accuracies.append(acc)

        self.model = model
        self.cweights = cweights
        self.accuracies = accuracies
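train() stores the per-round models and classifier weights, but the matching predict step is not shown. A minimal sketch of the usual AdaBoost-style weighted vote, assuming each stored model's predict returns values in {-1, +1} matching the f * 2 - 1 label mapping above, could be:

def boosted_predict(models, cweights, data):
    # sum each model's vote, scaled by its classifier weight
    scores = [0.0] * len(data)
    for mod, weight in zip(models, cweights):
        for i, pred in enumerate(mod.predict(data)):
            scores[i] += weight * pred
    # the sign of the weighted sum is the ensemble prediction
    return [1 if score > 0 else -1 for score in scores]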
Example #3
    def _get_best_feature_and_test(self, data: mldata.ExampleSet) -> Tuple:
        labels = mlutil.get_labels(data)
        feature_exs = mlutil.get_feature_examples(data)
        split_tests = mlutil.create_all_split_tests(data)
        l_type = mlutil.get_label_info(data).type
        label_tests = mlutil.create_split_tests(labels, l_type, as_tuple=False)

        # compute the information gain or gain ratio of each candidate test;
        # split_tests[i] holds the tests for the ith feature
        split_values = [
            [
                self.split_function(labels, label_tests, f, [t],
                                    self.partitions)
                for t in split_tests[i]
            ] for i, f in enumerate(feature_exs)
        ]
        i_max_feature = int(np.argmax([max(v) for v in split_values]))
        i_max_test = np.argmax(split_values[i_max_feature])
        best_test = split_tests[i_max_feature][i_max_test]
        # ID feature not considered when generating feature tests (add 1)
        best_feature = data.schema[i_max_feature + 1]
        return best_feature, best_test
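self.split_function (information gain or gain ratio) is not shown in these listings. A self-contained sketch of plain information gain for one boolean test, assuming the test is a callable over a feature value, might look like:

import math

def entropy(labels):
    n = len(labels)
    probs = (labels.count(lab) / n for lab in set(labels))
    return -sum(p * math.log2(p) for p in probs)

def information_gain(labels, feature_values, test):
    # entropy reduction from partitioning the examples by a boolean test
    passed = [lab for lab, v in zip(labels, feature_values) if test(v)]
    failed = [lab for lab, v in zip(labels, feature_values) if not test(v)]
    n = len(labels)
    remainder = sum(
        len(part) / n * entropy(part)
        for part in (passed, failed) if part)
    return entropy(labels) - remainder

For example, information_gain([0, 0, 1, 1], [1.0, 2.0, 3.0, 4.0], lambda v: v > 2.5) yields 1.0, since the test splits the examples into two pure partitions.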
Example #4
def cross_validate(
		learner: model.Model,
		data: mldata.ExampleSet,
		n_folds: int,
		save_as: str = None) -> Tuple[Tuple, Tuple]:
	"""Performs stratified cross validation on a general learner and data set.

	Args:
		learner: Model instance whose task is to train on the data.
		data: Full set of training examples.
		n_folds: Number of folds to perform cross validation.
		save_as: Base name of the file. Will be appended with the fold
			iteration (if there are multiple folds) and .txt.

	Returns:
		Two tuples, the first being the predictions from each fold,
		and the second being the corresponding labels.
	"""
	fold_predictions = []
	fold_test_labels = []
	if n_folds < 1:
		raise ValueError('Minimum number of folds is 1 (full dataset)')
	folds = get_folds(data, n_folds)
	for i in range(n_folds):
		if n_folds == 1:
			train = data
			test = data
		else:
			train, test = get_train_test_split(folds, i)
		# fold is 0 when n_folds == 1; otherwise folds are numbered from 1
		learner.fold = i + min(n_folds, 2) - 1
		learner.train(train)
		if save_as:
			if n_folds == 1:
				learner.save(f'{save_as}.txt')
			else:
				learner.save(f'{save_as}_{i + 1}.txt')
		fold_predictions.append(learner.predict(test))
		fold_test_labels.append(mlutil.get_labels(test))
	return tuple(fold_predictions), tuple(fold_test_labels)
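To score a learner, a caller can flatten the two returned tuples. A minimal sketch, assuming predictions and labels are aligned sequences of comparable values, is pooled accuracy over all folds:

def pooled_accuracy(fold_predictions, fold_test_labels):
    # count correct predictions across every fold, weighting examples equally
    correct = total = 0
    for preds, labels in zip(fold_predictions, fold_test_labels):
        correct += sum(p == lab for p, lab in zip(preds, labels))
        total += len(labels)
    return correct / total

# hypothetical usage, given some learner and ExampleSet:
# preds, labels = cross_validate(learner, data, n_folds=5)
# print(pooled_accuracy(preds, labels))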
Example #5
def get_folds(dataset: mldata.ExampleSet, n_folds: int) -> Tuple:
	"""Creates n stratified folds from the dataset.

	For each unique value of the label, examples from the dataset are
	assigned to a fold round-robin style. It may be the case that some folds
	have more training examples than others, depending on the number of
	folds and number of examples in the dataset.

	Args:
		dataset: Collection of training examples on which to generate folds.
		n_folds: Number of folds to create.

	Returns:
		Tuple of lists, where each list is a fold of examples.
	"""
	folds = defaultdict(list)
	labels = mlutil.get_labels(dataset)
	num_folds = max(1, n_folds)
	for label in set(labels):
		# group and shuffle this label's examples before dealing them out
		examples = [e for e, lab in zip(dataset, labels) if lab == label]
		random.shuffle(examples)
		for i, example in enumerate(examples):
			folds[i % num_folds].append(example)
	return tuple(folds.values())
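get_train_test_split, called in Example #4, does not appear in these listings. A minimal sketch consistent with how get_folds is used (hold out fold i for testing and pool the rest for training) could be the following; the real code presumably rebuilds an mldata.ExampleSet, whereas a plain list of examples is assumed here:

import itertools

def get_train_test_split(folds, test_index):
    # hold out one fold for testing; concatenate the others for training
    test = folds[test_index]
    train = list(itertools.chain.from_iterable(
        fold for i, fold in enumerate(folds) if i != test_index))
    return train, test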