def draw(clf, ds: DataSet, step):
    """Show the classifier's predictions over a 2-D grid together with the samples.

    A regular grid with spacing ``step`` is laid over the bounding box of the
    samples, every grid node is classified through ``clf.predict_single`` (via
    ``pm_predict``), and the result is rendered as a colour mesh. Samples
    labelled -1 are drawn in red, samples labelled 1 in blue, and the rows
    selected by ``clf.support_indices`` are marked with white crosses.

    Args:
        clf: Fitted classifier exposing ``predict_single`` and
            ``support_indices``.
        ds: Dataset providing ``get_X()`` and ``get_y()``; the unpacking below
            requires exactly two feature columns.
        step: Grid spacing used along both axes.
    """
    features = ds.get_X()
    labels = ds.get_y()

    # Bounding box of the samples; unpacking fixes the data to two columns.
    lower = np.amin(features, 0)
    upper = np.amax(features, 0)
    x_min, y_min = lower
    x_max, y_max = upper

    x_ticks = np.arange(x_min, x_max, step)
    y_ticks = np.arange(y_min, y_max, step)
    xx, yy = np.meshgrid(x_ticks, y_ticks)

    # One row per grid node, classified and folded back into the mesh shape.
    flat_x = xx.ravel()
    flat_y = yy.ravel()
    grid_points = np.c_[flat_x, flat_y]
    raw_predictions = pm_predict(clf.predict_single, grid_points, name='predict')
    mesh_values = np.array(raw_predictions).reshape(xx.shape)

    neg_x, neg_y = features[labels == -1].T
    pos_x, pos_y = features[labels == 1].T
    sup_x, sup_y = features[clf.support_indices].T

    # Drawn in this order so the crosses end up on top of the sample dots.
    layers = (
        (neg_x, neg_y, {'color': 'red', 's': 100}),
        (pos_x, pos_y, {'color': 'blue', 's': 100}),
        (sup_x, sup_y, {'color': 'white', 'marker': 'x', 's': 60}),
    )

    plt.figure(figsize=(10, 10))
    plt.pcolormesh(xx, yy, mesh_values, cmap=plt.get_cmap('seismic'), shading='auto')
    for xs, ys, scatter_kwargs in layers:
        plt.scatter(xs, ys, **scatter_kwargs)
    plt.show()
def choose_best(ds: DataSet):
    """Grid-search ``SMO`` hyper-parameters on ``ds`` and return the best set.

    Runs a 4-fold cross-validated ``GridSearchCV`` over ``GRID`` scored by
    accuracy, using all available workers (``n_jobs=-1``). The cross-validation
    results are handed to ``draw_metrics`` and the winning score and parameters
    are printed.

    Args:
        ds: Dataset providing ``get_X()`` and ``get_y()``.

    Returns:
        The ``best_params_`` of the fitted search.
    """
    search = GridSearchCV(
        estimator=SMO(),
        param_grid=GRID,
        cv=4,
        scoring='accuracy',
        verbose=1,
        n_jobs=-1,
    )

    features = ds.get_X()
    labels = ds.get_y()
    search.fit(features, labels)

    draw_metrics(search.cv_results_)

    best_score = search.best_score_
    best_params = search.best_params_
    print(f'Got best score {best_score} with params {best_params}')
    return search.best_params_
def test(ds: DataSet, name):
    """Train AdaBoost on a split of ``ds`` and plot accuracy per boosting step.

    The dataset is split with ``test_size=0.33``. A callback passed to
    ``AdaBoost`` records the accuracy on the test and train parts each time it
    is invoked, and the two resulting curves are plotted against the step
    numbers ``1..STEPS``.

    Args:
        ds: Dataset exposing ``test_train_split``; the resulting parts expose
            ``X`` and ``y``.
        name: Label inserted into the plot title.
    """
    print("Metric")
    train_ds, test_ds = ds.test_train_split(test_size=0.33)

    # Insertion order ("test" first) is kept for the plotting helper.
    metric_data = {"test": [], "train": []}
    tracked_parts = (("test", test_ds), ("train", train_ds))

    def record_accuracy(clf, step):
        # ``step`` is part of the callback signature but is not needed here.
        for part_name, part in tracked_parts:
            predicted = clf.predict(part.X)
            metric_data[part_name].append(accuracy_score(part.y, predicted))

    booster = AdaBoost(n_estimator=STEPS, callback=record_accuracy, verbose=True)
    booster.fit(train_ds.X, train_ds.y)

    step_numbers = list(range(1, STEPS + 1))
    metric_plot(
        metric_data,
        x_label='Steps',
        x_values=step_numbers,
        title=f'Accuracy for {name}',
        default_color=True,
    )
def read_dataset(filename) -> DataSet:
    """Load a CSV file into a ``DataSet`` with labels encoded as 1 / -1.

    All columns except the last form the feature matrix. The last column is
    the class label: the value ``'P'`` becomes 1 and every other value
    becomes -1.

    Args:
        filename: Anything ``pandas.read_csv`` accepts as its first argument.

    Returns:
        ``DataSet(X, y)`` where ``X`` holds the feature columns and ``y`` the
        integer labels.
    """
    values = pd.read_csv(filename).values
    X = values[:, :-1]

    # Cast to object so ``==`` is always an elementwise comparison, whatever
    # dtype pandas picked for the table.
    raw_labels = values[:, -1].astype(object)

    # np.where replaces the previous np.vectorize(lambda ...) call, which
    # raises ValueError on a file with a header but no rows (size-0 input
    # without ``otypes``). The result for non-empty input is unchanged.
    y = np.where(raw_labels == 'P', 1, -1)
    return DataSet(X, y)
import os
import csv

from utils import common
from config import configer
from utils.data_set import DataSet, get_important_subject


def get_run_path():
    """Return the directory containing this script, with symlinks resolved."""
    script_path = os.path.realpath(__file__)
    return os.path.dirname(script_path)


if __name__ == "__main__":
    configer.load_data(get_run_path())

    # Load the data for the major, one dataset per period.
    A, B, C = (
        DataSet(csv_name)
        for csv_name in (
            "2014-2015_计科.csv",
            "2016-2017_计科.csv",
            "2018-2019_计科.csv",
        )
    )
    for dataset in (A, B, C):
        dataset.get_special_data()

    # Get the intersection of courses across the three datasets
    # (translated from the original comment), based on the "ZWMC" column.
    course_columns = [dataset.get_column("ZWMC") for dataset in (A, B, C)]
    important = get_important_subject(*course_columns)

    output_path = os.path.join(
        configer.get_value("dataset_path"), "csv/important_sub.txt"
    )
    with open(output_path, "w+") as out_file:
        out_file.write(str(important))
    print("ok")