def single_split(data, estimator, scoring, test_size=0.4):
    """Fit *estimator* on one random train/test split of *data* and score it.

    Parameters
    ----------
    data : dataset rows; one column holds the class label
        (split into attributes/classes by ``utils.horizontal_split``).
    estimator : estimator fitted in place via ``estimator.fit``.
    scoring : a single scorer name accepted by ``sklearn.metrics.get_scorer``,
        or an iterable of such names.
    test_size : fraction of *data* held out for testing
        (default 0.4, preserving the previously hard-coded split).

    Returns
    -------
    A single score for a string *scoring*, or a list of scores (one per
    metric, in order) when *scoring* is an iterable of names.
    """
    attrs, classes = utils.horizontal_split(data)
    X_train, X_test, y_train, y_test = train_test_split(
        attrs, classes, test_size=test_size
    )
    estimator.fit(X_train, y_train)
    # BUG FIX: score() passes a *list* of metric names, but get_scorer()
    # accepts only one name — score each metric separately in that case.
    if isinstance(scoring, str):
        return get_scorer(scoring)(estimator, X_test, y_test)
    return [get_scorer(name)(estimator, X_test, y_test) for name in scoring]
def load(file):
    """Load a CSV dataset from *file* and split it into (attributes, classes).

    Every cell is converted to ``float`` except those in the class column
    (located via ``utils.get_class_index``), which keep their string values.

    Parameters
    ----------
    file : path of the CSV file to read.

    Returns
    -------
    Whatever ``utils.horizontal_split`` returns for the parsed rows —
    from the call sites in this file, an ``(attrs, classes)`` pair.
    """
    # BUG FIX: the original left the file handle open; "with" closes it.
    with open(file, "r") as handle:
        data = list(csv.reader(handle))
    class_index = utils.get_class_index(data)
    for row in data:
        for j, val in enumerate(row):
            if j != class_index:
                row[j] = float(val)
    return utils.horizontal_split(data)
def get_attr_by_class_probs(self, X, y):
    """Return per-class attribute probabilities.

    Records are grouped by their class label, then each group's attributes
    are run through ``self.get_attr_probs`` with a fresh set of empty bins.
    The result maps each class label to that group's attribute probabilities.
    """
    merged = utils.merge_attrs(X, y)
    class_index = utils.get_class_index(merged)
    # NOTE(review): get_empty_classes_dict([]) — if the implementation shares
    # one list across classes, the appends below would leak between classes;
    # confirm it creates a fresh list per class.
    grouped = self.get_empty_classes_dict([])
    for row in merged:
        grouped[row[class_index]].append(row)
    # Replace each class's record list with its attribute probabilities.
    for label in grouped:
        class_attrs, _class_labels = utils.horizontal_split(grouped[label])
        grouped[label] = self.get_attr_probs(class_attrs, self.empty_bins())
    return grouped
def score(data, estimator_const, est_param, k=10, cross_val=True, stratified=True):
    """Construct an estimator and evaluate it on *data*.

    Parameters
    ----------
    data : dataset rows; split into attributes/classes by
        ``utils.horizontal_split``.
    estimator_const : callable building the estimator from
        ``(unique_classes, est_param, attr_ranges)``.
    est_param : opaque parameter forwarded to *estimator_const*.
    k : number of folds when *cross_val* is true (default 10).
    cross_val : if true, k-fold cross-validation; otherwise one split.
    stratified : forwarded to ``validation.k_fold``.

    Returns
    -------
    The score from ``validation.k_fold`` or ``validation.single_split``.
    """
    attrs, classes = utils.horizontal_split(data)
    attr_ranges = utils.attr_ranges(attrs)
    # it's important to recognize classes from both training and test
    # set (whole data)
    unique_classes = utils.unique_classes(classes)
    estimator = estimator_const(unique_classes, est_param, attr_ranges)
    scoring = ['f1_macro', 'accuracy', 'precision_macro', 'recall_macro']
    if cross_val:
        result = validation.k_fold(data, estimator, scoring, k, stratified)
    else:
        result = validation.single_split(data, estimator, scoring)
    prefix = "mean " if cross_val else ""
    print(prefix + f"score: {result}")
    return result
def k_fold(data, estimator, scoring, k, stratified=True):
    """Cross-validate *estimator* on *data* and return one mean test score.

    Parameters
    ----------
    data : dataset rows; split by ``utils.horizontal_split``.
    estimator : estimator passed to ``sklearn.model_selection.cross_validate``.
    scoring : list of sklearn scorer names.
    k : number of folds.
    stratified : use ``StratifiedKFold`` when true, plain ``KFold`` otherwise.

    Returns
    -------
    A scalar: the test scores averaged over all metrics and all folds.
    """
    attrs, classes = utils.horizontal_split(data)
    if stratified:
        cv = StratifiedKFold(n_splits=k)
    else:
        cv = KFold(n_splits=k)
    results = cross_validate(estimator, attrs, classes, cv=cv, scoring=scoring)
    per_metric = [results["test_" + name] for name in scoring]
    # mean by metrics then by folds
    return np.mean(per_metric, (0, 1))