def fit(j, testing=None, coef=False):
    # Set training data and target data
    X = np.array(data.loc[1:, np.delete(data.columns.values, 0)])
    Y = np.array(data.loc[1:, ['five_star']]).ravel()

    # Assign the training/testing sets
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=1 - j)

    # Assign the testing set to the new vars if necessary
    if testing is None:
        testing = [X_test, Y_test]

    # Instantiate binary classification logistic regression model
    logreg = linear_model.LogisticRegression(C=1e5)

    # Fit model
    logreg.fit(X_train, Y_train)

    # Set hypothesis and true target data
    h = logreg.predict(testing[0])
    y = testing[1]

    # Return the coefficient matrix if necessary
    if coef:
        return (h, y, error_rate(h, y), logreg.coef_)
    return (h, y, error_rate(h, y))
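# Usage sketch (an assumption, not part of the original source): fit() reads
# the module-level `data` DataFrame and an external error_rate() helper, so
# both must exist before calling it. Sweeping the training fraction j shows
# how the holdout error responds to more training data.
for frac in (0.5, 0.7, 0.9):
    h, y, err = fit(frac)
    print("train fraction %.1f -> holdout error %.3f" % (frac, err))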
def runKNN():
    docs_train, docs_test, y_train, y_test = train_test_split(
        data['tweet'], data['mvmt'], test_size=0.25, random_state=None)

    pipeline = Pipeline([
        ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),
        ('clf', KNeighborsClassifier(n_neighbors=3)),
    ])

    # Fit the pipeline on the training set using grid search for the parameters
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
    }
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
    grid_search.fit(docs_train, y_train)

    n_candidates = len(grid_search.cv_results_['params'])
    for i in range(n_candidates):
        print(i, 'params - %s; mean - %0.2f; std - %0.2f' %
              (grid_search.cv_results_['params'][i],
               grid_search.cv_results_['mean_test_score'][i],
               grid_search.cv_results_['std_test_score'][i]))

    # TASK: Predict the outcome on the testing set and store it in a variable
    # named y_predicted
    y_predicted = grid_search.predict(docs_test)

    # Print the classification report
    print(metrics.classification_report(y_test, y_predicted))

    # Print and plot the confusion matrix
    cm = metrics.confusion_matrix(y_test, y_predicted)
    print(cm)
    plt.matshow(cm)
    plt.show()
def run_main():
    file_df = pd.read_csv('../dataset/voice.csv')
    # print(file_df)
    insect_dataset(file_df)

    # Handle missing data
    drop_na(file_df)

    # Inspect label counts, grouped
    # print(file_df['label'].value_counts())

    # Visualize feature distributions
    fea_name1 = 'meanfun'
    fea_name2 = 'centroid'
    # Feature plot for a pair of attributes
    # visaulize_two_feature(file_df, fea_name1, fea_name2)
    # Feature plot for a single attribute
    # visaulize_single_feature(file_df, fea_name1)
    # Multiple features
    fea_name = ['meanfreq', 'Q25', 'Q75', 'skew', 'centroid', 'label']
    # visaulize_muilt_feature(file_df, fea_name)

    X = file_df.iloc[:, :-1].values
    file_df['label'].replace('male', 0, inplace=True)
    file_df['label'].replace('female', 1, inplace=True)
    y = file_df['label'].values

    # Feature standardization
    X = preprocessing.scale(X)

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1 / 3.,
                                                        random_state=5)

    # Model selection via cross-validation
    cv_scores = []
    k_range = range(1, 31)
    for k in k_range:
        knn = KNeighborsClassifier(k)
        # print('knn:', knn)
        scores = cross_val_score(knn, X_train, y_train, cv=10, scoring='accuracy')
        score_mean = scores.mean()
        cv_scores.append(score_mean)
        print('%i:%.4f' % (k, score_mean))
    best_k = np.argmax(cv_scores) + 1

    # Train the model
    knn_model = KNeighborsClassifier(best_k)
    knn_model.fit(X_train, y_train)
    print('Test the model, accuracy:', knn_model.score(X_test, y_test))
    return ''
def _train(self) -> Model:
    data, label = load_data0_cycle()
    train_data, test_data, train_label, test_label = train_test_split(
        data, label, test_size=0.2)
    train_data = np.reshape(train_data, train_data.shape + (1, ))
    train_label = to_categorical(train_label)
    test_data = np.reshape(test_data, test_data.shape + (1, ))
    test_label = to_categorical(test_label)

    network_input = Input(shape=(8, 200, 1))
    # If you change the network structure here, remember to update the
    # parameters of the visualization function below accordingly
    network = Conv2D(filters=20, kernel_size=(1, 10))(network_input)
    network = Conv2D(filters=40, kernel_size=(4, 10), activation=tanh)(network)
    network = MaxPool2D((2, 2))(network)
    network = Flatten()(network)
    network = Dense(units=40, activation=tanh)(network)
    network = Dense(units=10, activation=softmax)(network)
    network = Model(inputs=[network_input], outputs=[network])
    network.compile(optimizer=RMSprop(),
                    loss=categorical_crossentropy,
                    metrics=[categorical_accuracy])
    network.summary()
    self.train_history = network.fit(train_data, train_label,
                                     batch_size=32, epochs=16)
    self.evaluate_history = network.evaluate(test_data, test_label)
    return network
def get_train():
    corpus = get_vec()
    labels = get_labels()
    X_train, X_test, y_train, y_test = train_test_split(corpus, labels, test_size=0.2)
    return X_train, y_train, X_test, y_test
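# Caller note (added): get_train() deliberately reorders train_test_split's
# outputs, so unpack as (X_train, y_train, X_test, y_test), not in sklearn's
# usual (X_train, X_test, y_train, y_test) order.
X_train, y_train, X_test, y_test = get_train()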
def base_linear_model(dataframe, device_parameter):
    X_train, X_validation, y_train, y_validation = train_test_split(
        dataframe, device_parameter, test_size=0.25, random_state=42, shuffle=True)

    # Fitting the data into linear regression model
    lin_reg = LinearRegression()
    lin_reg.fit(X_train, y_train)

    # Evaluation of base linear regression model
    t_pred = lin_reg.predict(X_train)
    y_pred = lin_reg.predict(X_validation)

    # Calculating mean squared error on training and testing
    train_mse = mean_squared_error(y_train, t_pred)
    test_mse = mean_squared_error(y_validation, y_pred)
    print("Training mean squared error: ", train_mse)
    print("Testing mean squared error: ", test_mse)

    # Plotting results of linear regression base model
    fig, ax = plt.subplots()
    ax.scatter(y_pred, y_validation, edgecolors=(0, 0, 1))
    ax.plot([y_validation.min(), y_validation.max()],
            [y_validation.min(), y_validation.max()], 'r--', lw=3)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()
def process_gsc_data(value):
    if value == 0:
        fetched_data = extract_human_data_sub()
    else:
        fetched_data = extract_human_data_con()
    fetched_data = pd.DataFrame(fetched_data)
    BigSigma = fetched_data.cov()
    # Drop zero-variance columns before inverting the covariance matrix
    fetched_data = fetched_data.loc[:, (BigSigma != 0).any(axis=0)]
    BigSigma = pd.DataFrame(fetched_data).cov()
    BigSigma = np.diag(np.diag(BigSigma))
    BigSigma_inv = np.linalg.inv(BigSigma)
    fetched_data = fetched_data.values
    print(fetched_data.shape, BigSigma_inv.shape)
    # `target` is expected to be defined at module scope
    train, test_and_val, train_out, test_and_val_out = train_test_split(
        fetched_data, target, test_size=0.3, shuffle=True)
    train = np.array(train)
    pivot = int(len(test_and_val) / 2)
    test = test_and_val[:pivot]
    val = test_and_val[pivot:]
    test_out = test_and_val_out[:pivot]
    val_out = test_and_val_out[pivot:]
    # print(len(fetched_data))
    return train, test, val, train_out, test_out, val_out, BigSigma_inv
def __init__(self, **kwargs):
    super(BoschChallenge, self).__init__(path=None)
    stream = open("../data/train.yaml", "r")
    # safe_load: plain yaml.load requires an explicit Loader in modern PyYAML
    files = yaml.safe_load(stream)
    df = pd.DataFrame(files)
    df['path'] = df['path'].apply(lambda x: '../data/' + x[x.find('/'):])
    self.trainData, self.valData = train_test_split(df, test_size=.2)
def _iter_test_indices(self, X, y=None, groups=None):
    n = _num_samples(X)
    index = np.arange(n)
    train_index, test_index = train_test_split(
        index, test_size=self.test_size, random_state=self.random_state)
    yield test_index
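# A minimal self-contained sketch (assumed, not from the original source) of
# how a method like _iter_test_indices plugs into scikit-learn's CV machinery:
# subclass BaseCrossValidator, yield one random holdout fold, and report a
# single split. The class name RandomHoldoutSplit is hypothetical.
import numpy as np
from sklearn.model_selection import BaseCrossValidator, train_test_split
from sklearn.utils.validation import _num_samples


class RandomHoldoutSplit(BaseCrossValidator):
    def __init__(self, test_size=0.25, random_state=None):
        self.test_size = test_size
        self.random_state = random_state

    def _iter_test_indices(self, X, y=None, groups=None):
        n = _num_samples(X)
        index = np.arange(n)
        _, test_index = train_test_split(
            index, test_size=self.test_size, random_state=self.random_state)
        yield test_index

    def get_n_splits(self, X=None, y=None, groups=None):
        return 1  # one random holdout fold per call to split()

# e.g. cross_val_score(estimator, X, y, cv=RandomHoldoutSplit(test_size=0.2))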
def test_ml_pipeline():
    'load a test data set, run SVM on it, and plot the predictions vs the actual values'
    data, targets = ReactivityDataLoader().load_mopac_learning()
    regressor = SVR(C=1000)
    trainData, testData, trainTargets, testTargets = train_test_split(data, targets)
    regressor.fit(trainData, trainTargets)
    os.chdir(str(Path.home() / 'Desktop'))
    main.plotScatterPlot(testTargets, regressor.predict(testData), 'predictedVsActual')
def fit(self, X, y):
    trees = []
    for index in range(self.n_estimators):
        tree = DecisionTreeClassifier()
        trees.append(tree)
        # Each tree trains on a fresh random subsample of max_features rows;
        # only the holdout from the final split is returned with the ensemble.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=int(self.max_features))
        trees[index].fit(X_train, y_train)
    return trees, y_test, X_test
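# Hedged companion sketch (the helper below is assumed, not in the original
# source): majority-vote predictions from the per-tree ensemble returned by
# fit() above, evaluated on whatever holdout the caller supplies.
def predict_majority(trees, X):
    votes = np.array([tree.predict(X) for tree in trees])
    # column-wise majority vote across the ensemble
    return np.apply_along_axis(
        lambda col: np.bincount(col.astype(int)).argmax(), 0, votes)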
def train_boost(booster, seed, oversampling=-1.0, use_tfidf=False,
                enable_cv=False, use_alldata=False, num_trees=-1):
    train, y, features = prepare_train()

    if use_tfidf:
        print('Using raw tf-idf sparse matrix ... ')
        features = 'auto'
        train_sparse = sparse.csr_matrix(train.values)
        # tfidf_sparse = load_sparse_csr('tfidf_stem_train.npz')
        bm25_sparse = load_sparse_csr('bm25_train.npz')
        # bm25_sparse = bm25_sparse[404290 - 50000:, :]
        # train = sparse.hstack([train_sparse, tfidf_sparse])
        # common_words = load_sparse_csr('train_tfidf_commonwords.npz')
        # symmdif = load_sparse_csr('train_tfidf_symmdiff.npz')
        train = sparse.hstack([train_sparse, bm25_sparse])
        del train_sparse, bm25_sparse
        print('Train shape: ', train.shape)

    if enable_cv:
        train, y = shuffle(train, y)
        booster.cv(train, y)
        exit()

    if use_alldata:
        print('Using all data to fit classifier ... ')
        assert num_trees > 0
        results = booster.fit_all(train, y, num_trees, features)
    else:
        print('Using train/dev split to fit classifier ... ')
        X_train, X_eval, y_train, y_eval = train_test_split(
            train, y, stratify=y, test_size=0.20, random_state=seed)
        if oversampling > 0:
            print('Oversampling X_train, X_eval datasets ... ')
            X_train, y_train = oversample_sparse(X_train, y_train, p=oversampling)
            X_eval, y_eval = oversample_sparse(X_eval, y_eval, p=oversampling)
        results = booster.fit(X_train, X_eval, y_train, y_eval, features)
        y_pred = booster.predict(X_eval)
        print(log_loss(y_eval, y_pred))
        print(y_pred)

    train = None
    y = None
    del train
    del y
    return results
def main():
    samples = load_files("data")
    sequence_dim = 20
    sequence_lag = 1
    samples, labels = make_sequences(samples, sequence_dim, sequence_lag)

    model = Sequential()
    model.add(LSTM(128, input_shape=(sequence_dim, 2), return_sequences=True))
    model.add(LSTM(128))
    model.add(Dense(64))
    model.add(Dense(2))
    print(model.summary())

    (trainSamples, testSamples, trainLabels, testLabels) = train_test_split(
        samples, labels, test_size=0.15, random_state=42)

    imname = "animal-11"
    image = cv2.imread("img/{}.jpg".format(imname))
    # create ground truth image with all train gazes
    for j in range(len(trainLabels)):
        s = trainLabels[j]
        cv2.circle(image, (int(s[0]), int(s[1])), 10, (255, 0, 0), 3)
    cv2.imwrite("img/{}_truth.jpg".format(imname), image)

    model.compile(loss="mean_absolute_error", optimizer="adam", metrics=["mae"])
    EPOCHS = 30
    for e in range(EPOCHS):
        print("=" * 50)
        print("Iteration: {}".format(e))
        model.fit(trainSamples, trainLabels,
                  validation_data=(testSamples, testLabels),
                  epochs=1, batch_size=128, verbose=1)
        predictions = model.predict(testSamples)
        # create and save image with all current predictions
        image = cv2.imread("img/{}.jpg".format(imname))
        cv2.line(image, (0, 0), (200, 200), (255, 255, 255), 2)
        for p in predictions:
            cv2.circle(image, (int(p[0]), int(p[1])), 10, (0, 255, 0), 3)
        cv2.imwrite("img/{}_e{:02d}.jpg".format(imname, e), image)
    model.save("model_rnn.h5")
def get_SVM_classifier(datas, labels, split_size):
    # Split into training and test data
    x_train, x_test, y_train, y_test = train_test_split(datas, labels,
                                                        test_size=split_size)
    # Build the linear SVM and fit it on the training split only,
    # so the test score below is not contaminated by test data
    clf = LinearSVC(C=1, loss="hinge").fit(x_train, y_train)
    print("Score of train datas:{0:.2%}".format(clf.score(x_train, y_train)))
    print("Score of test datas(split_size:{0}):{1:.2%}".format(
        split_size, clf.score(x_test, y_test)))
    return clf
def train(args, **kwargs):
    n_gaus_comp = args.ncomp
    kmeans_mu = kwargs.get('kmeans', False)
    X_train = toy_data(n_samples=10000)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_train, X_train, test_size=0.2, random_state=1)
    X_train, X_dev, Y_train, Y_dev = train_test_split(
        X_train, Y_train, test_size=0.2, random_state=1)
    input_size = X_train.shape[1]
    output_size = X_train.shape[1]
    batch_size = 1000
    mus = np.random.randn(n_gaus_comp, 2).astype('float32')
    # mus = X_train[0:n_gaus_comp]
    raw_stds = None
    raw_cors = None
    model = NNModel(n_epochs=100000,
                    batch_size=batch_size,
                    input_size=input_size,
                    output_size=output_size,
                    early_stopping_max_down=10,
                    n_gaus_comp=n_gaus_comp,
                    mus=mus,
                    sigmas=raw_stds,
                    corxy=raw_cors)
    model.build()
    model.fit(X_train, Y_train, X_dev, Y_dev, X_test, Y_test)
    mus_eval, sigmas_eval, corxy_eval, pis_eval = model.f_predict(X_dev)
    mus_eval, sigmas_eval, corxy_eval, pis_eval = (
        np.asarray(mus_eval), np.asarray(sigmas_eval),
        np.asarray(corxy_eval), np.asarray(pis_eval))
    logging.info(mus_eval)
    logging.info(sigmas_eval)
    pdb.set_trace()
def main():
    df = pd.read_csv("data/titanic.csv")
    print(df)
    df.drop(['Name'], axis=1, inplace=True)
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    print(df)
    # print("probabilities of survival:")
    # for col in df.columns[1:]:
    #     df2 = pd.crosstab(values=df.index, index=df['Survived'], columns=df[col], aggfunc='count')
    #     print(df2)

    features = list(df.columns[1:])
    labels = ['Survived', 'Not Survived']
    data = df[df.columns[1:]].values.tolist()
    target = list(df['Survived'].map({True: 1, False: 0}))
    print(len(features), "Features: ", features)
    print(len(data), 'Data: ', data)
    print(len(target), 'Target: ', target)

    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("Accuracy of random forest model: ",
          metrics.accuracy_score(y_test, y_pred))

    feature_imp = pd.Series(clf.feature_importances_,
                            index=features).sort_values(ascending=True)
    # print(feature_imp)
    # Creating a bar plot
    sns.barplot(x=feature_imp, y=feature_imp.index)
    # Add labels to graph
    plt.xlabel('Feature Importance Score by Random Forest')
    plt.ylabel('Features')
    plt.legend()
    plt.show()

    # Predict probabilities for the test data.
    probs = clf.predict_proba(data)
    # Keep probabilities of the positive class only.
    probs = probs[:, 1]
    # Compute the AUC score.
    auc = roc_auc_score(target, probs)
    print('AUC: %.2f' % auc)
    # fpr, tpr, thresholds = roc_curve(y_test, probs)
    plot_roc_curve(clf, data, target)
    plt.show()
def check_generalization(pipe, metric, X, y, test_size=0.2,
                         dishonnest_validation_mlp=False):
    '''
    Check for poor generalization, to catch an overfit pipeline.

    :param pipe: pipeline to evaluate
    :param metric: scoring method
    :param X: train and test features
    :param y: train and test targets
    :param dishonnest_validation_mlp: warning, if True this introduces a small
        leakage by using the test split as the network's validation set
    '''
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    # print([i[0] for i in pipe.steps])
    # print([i[0] for i in pipe.named_steps.u_prep.transformer_list])

    if dishonnest_validation_mlp:
        assert hasattr(pipe.named_steps.clf, "validation"), \
            "no attribute validation, pipe's clf is no instance of CustomNNCategorical"
        prep_val_pipe = deepcopy(pipe)
        prep_val_pipe.steps = prep_val_pipe.steps[:-1]  # only keep the preprocessing
        prep_val_pipe = prep_val_pipe.fit(x_train)  # , y_train
        x_val = prep_val_pipe.transform(x_test)
        pipe.named_steps.clf.validation = (x_val, y_test)

    # add a print of X dimension
    pipe.steps = pipe.steps[:-1] + [("dim_print", DimPrinter())] + pipe.steps[-1:]

    pipe.fit(x_train, y_train)
    pred_train = pipe.predict(x_train)
    pred_test = pipe.predict(x_test)

    score_train = metric(pred_train, y_train)
    score_test = metric(pred_test, y_test)

    gen = {
        "score_train": score_train,
        "score_test": score_test,
        "fitted_pipe": pipe
    }
    return gen
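# Usage sketch (assumptions: a scikit-learn Pipeline `pipe` whose final step is
# named "clf", and accuracy as the metric; neither is fixed by the source).
from sklearn.metrics import accuracy_score
gen = check_generalization(pipe, accuracy_score, X, y, test_size=0.2)
print("train %.3f vs test %.3f" % (gen["score_train"], gen["score_test"]))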
def preprocessing(self, file_name, column_num, ratio, random_state):
    '''
    Standardize the loaded samples and split them into training and test
    sets with the given ratio and random state.

    :param file_name: raw sklearn-style sample file to process
    :param column_num: column index at which to split targets from features
    :param ratio: proportion of the data used for the training set
    :param random_state: random state for the train/test split
    :return:
    '''
    data = np.loadtxt(file_name, dtype=str, delimiter=' ')
    # Split the loaded txt file at column column_num
    y, x = np.split(data, [column_num], axis=1)
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, random_state=random_state, train_size=ratio)
    return x, x_train, x_test, y, y_train, y_test
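# Usage sketch (the instance name and file are hypothetical): split a
# space-delimited file at column 1, keeping 80% of the rows for training.
x, x_train, x_test, y, y_train, y_test = splitter.preprocessing(
    'samples.txt', column_num=1, ratio=0.8, random_state=7)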
def train():
    X1, Y1 = loadSample("ss.txt", 0)
    X2, Y2 = loadSample("good.txt", 1)
    X = np.array(X1 + X2)
    Y = np.array(Y1 + Y2)
    x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=1,
                                                        train_size=0.8)
    clf = svm.SVC(C=30, kernel='rbf', gamma=2.5, decision_function_shape='ovr')
    rf = clf.fit(x_train, y_train.ravel())
    joblib.dump(rf, 'rf.model')  # save the model file
    print("Training set accuracy:", clf.score(x_train, y_train))
    print("Test set accuracy:", clf.score(x_test, y_test))
def setUpClass(cls):
    # Classification use-case
    cls.X_c, cls.y_c = make_moons(1000, noise=0.5)
    cls.X_c = pd.DataFrame(cls.X_c, columns=['F1', 'F2'])
    cls.target_names = ['class 0', 'class 1']
    cls.X_train_c, cls.X_test_c, cls.y_train_c, cls.y_test_c = train_test_split(
        cls.X_c, cls.y_c)
    cls.classifier_est = DecisionTreeClassifier(max_depth=5, random_state=5)
    cls.classifier_est.fit(cls.X_train_c, cls.y_train_c)
    cls.interpreter = Interpretation(cls.X_train_c, feature_names=cls.X_c.columns)
    cls.model_inst = InMemoryModel(cls.classifier_est.predict,
                                   examples=cls.X_train_c,
                                   model_type='classifier',
                                   unique_values=[0, 1],
                                   feature_names=cls.X_c.columns,
                                   target_names=cls.target_names,
                                   log_level=_INFO)
def test_multivariate(self):

    def ignore_scalar_warning():
        warnings.filterwarnings(
            "ignore",
            category=UserWarning,
            message="All the covariates are scalar.")

    X, y = make_regression(n_samples=20, n_features=10, random_state=1, bias=3.5)
    X_train, X_test, y_train, _ = train_test_split(X, y, random_state=2)

    for regularization_parameter in [0, 1, 10, 100]:
        with self.subTest(regularization_parameter=regularization_parameter):
            sklearn_l2 = Ridge(alpha=regularization_parameter)
            skfda_l2 = LinearRegression(
                regularization=L2Regularization(
                    regularization_parameter=regularization_parameter),
            )

            sklearn_l2.fit(X_train, y_train)
            with warnings.catch_warnings():
                ignore_scalar_warning()
                skfda_l2.fit(X_train, y_train)

            sklearn_y_pred = sklearn_l2.predict(X_test)
            with warnings.catch_warnings():
                ignore_scalar_warning()
                skfda_y_pred = skfda_l2.predict(X_test)

            np.testing.assert_allclose(sklearn_l2.coef_, skfda_l2.coef_[0])
            np.testing.assert_allclose(sklearn_l2.intercept_, skfda_l2.intercept_)
            np.testing.assert_allclose(sklearn_y_pred, skfda_y_pred)
def driver():
    dataset = build()
    delaylist = [
        'ArrDelay', 'DepDelay', 'CarrierDelay', 'WeatherDelay', 'NASDelay',
        'SecurityDelay', 'LateAircraftDelay'
    ]
    # plotStats(dataset, plotlist1, 'SFO')
    # print(dataset.columns.tolist())
    dataset = dataset.reset_index()
    dataset = dataset.fillna(0)  # assign the result: fillna is not in-place by default

    # Converting categorical features to numerics
    dataset["Dest"] = dataset["Dest"].astype('category')
    dataset["Dest"] = dataset["Dest"].cat.codes
    # dataset = dataset.sample(n=20000)
    dataset['Date'] = dataset['Date'].apply(lambda x: x.timestamp())

    dataSFO = dataset.loc[dataset['Origin'].isin(['SFO'])]
    dataOAK = dataset.loc[dataset['Origin'].isin(['OAK'])]
    dataSFO = dataSFO.iloc[0:10000]
    dataOAK = dataOAK.iloc[0:10000]
    frames = [dataSFO, dataOAK]
    NNdata = pd.concat(frames)
    # NNdata = NNdata.sample(n=20000)
    labels = NNdata["Origin"]
    NNdata.drop('Origin', axis=1, inplace=True)
    delayset = dataset[delaylist]
    c1 = dataset.DayOfWeek.unique()
    # labels = dataset["Origin"]
    le = LabelEncoder()
    labels = le.fit_transform(labels)
    labels = np_utils.to_categorical(labels, 2)
    data = NNdata
    x_train, x_test, y_train, y_test = train_test_split(data, labels,
                                                        train_size=0.8)
    FeedForward(x_train, x_test, y_train, y_test, len(NNdata.dtypes))
def process_gsc__data_con():
    concatenated_data = extract_gsc_data_con()
    concatenated_data = pd.DataFrame(concatenated_data)
    BigSigma = concatenated_data.cov()
    concatenated_data = concatenated_data.loc[:, (BigSigma != 0).any(axis=0)]
    BigSigma = pd.DataFrame(concatenated_data).cov()
    BigSigma = np.diag(np.diag(BigSigma))
    BigSigma_inv = np.linalg.inv(BigSigma)
    concatenated_data = concatenated_data.values
    print(concatenated_data.shape, BigSigma_inv.shape)
    train, test_and_val, train_out, test_and_val_out = train_test_split(
        concatenated_data, target, test_size=0.3, shuffle=True)
    train = np.array(train)
    pivot = int(len(test_and_val) / 2)
    test = test_and_val[:pivot]
    val = test_and_val[pivot:]
    test_out = test_and_val_out[:pivot]
    val_out = test_and_val_out[pivot:]
    # print(len(concatenated_data))
    return train, test, val, train_out, test_out, val_out, BigSigma_inv
def load_data(extend_disgust):
    '''
    Extract data from the 'fer2013.csv' file.

    extend_disgust: whether to extend the disgust class

    return: numpy arrays
        train_X: shape (?, 48, 48)
        validation_X: shape (?, 48, 48)
        train_y: shape (?,)
        validation_y: shape (?,)
    '''
    data = pd.read_csv("../../dataset/fer2013/fer2013.csv")
    X = []
    y = []
    for (pixels, emotion) in zip(data['pixels'], data['emotion']):
        # if emotion == 0 or emotion == 1 or emotion == 2:
        #     continue
        img = np.array(pixels.split(' '), dtype=np.uint8)
        img = img.reshape((48, 48))
        # img = cv2.equalizeHist(img)
        y.append(emotion)
        X.append(img)

    if extend_disgust:
        # Extend the disgust expression data to offset the class imbalance:
        # 'disgust' has far fewer samples than the other classes.
        disgust_image = np.load('../../dataset/fer2013/extend_disgust.npy')
        X.extend(disgust_image)
        y.extend(np.ones((len(disgust_image), )))

    X = np.array(X, dtype=np.uint8)
    y = np.array(y, dtype=np.uint8)
    X = X.astype('float32')

    train_X, validation_X, train_y, validation_y = \
        train_test_split(X, y, test_size=0.2, random_state=0)
    return train_X, validation_X, train_y, validation_y
def runTheLinearSVC():
    docs_train, docs_test, y_train, y_test = train_test_split(
        data['tweet'], data['mvmt'], test_size=0.25, random_state=None)

    # TASK: Build a vectorizer / classifier pipeline that filters out tokens
    # that are too rare or too frequent
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),
        ('clf', LinearSVC(C=1000)),
    ])

    # TASK: Build a grid search to find out whether unigrams or bigrams are
    # more useful.
    # Fit the pipeline on the training set using grid search for the parameters
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
    }
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
    grid_search.fit(docs_train, y_train)

    # TASK: print the mean and std for each candidate along with the parameter
    # settings for all the candidates explored by grid search.
    n_candidates = len(grid_search.cv_results_['params'])
    for i in range(n_candidates):
        print(i, 'params - %s; mean - %0.2f; std - %0.2f' %
              (grid_search.cv_results_['params'][i],
               grid_search.cv_results_['mean_test_score'][i],
               grid_search.cv_results_['std_test_score'][i]))

    # TASK: Predict the outcome on the testing set and store it in a variable
    # named y_predicted
    y_predicted = grid_search.predict(docs_test)

    # Print the classification report
    print(metrics.classification_report(y_test, y_predicted))

    # Print and plot the confusion matrix
    cm = metrics.confusion_matrix(y_test, y_predicted)
    print(cm)
    plt.matshow(cm)
    plt.show()
def fit2(self, X, y, feature_names):
    X_train, X_eval, y_train, y_eval = train_test_split(
        X, y, stratify=y, test_size=0.20,
        random_state=np.random.randint(50, 1000))
    print(self.params)
    print('LightGBM: training ... ')
    eval_result = {}
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)
    self.gbm = lgb.train(
        self.params,
        lgb_train,
        num_boost_round=self.params['n_estimators'],
        valid_sets=[lgb_train, lgb_eval],
        verbose_eval=self.params['verbose_eval'],
        evals_result=eval_result,
        early_stopping_rounds=self.params['early_stopping_rounds'],
        feature_name=feature_names)
    return self.gbm, eval_result
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_moons
from sklearn.tree import DecisionTreeClassifier
from InterpretableDecisionTreeClassifier import IDecisionTreeClassifier
from treeutils import tree_to_code
from sklearn.model_selection import train_test_split  # public API, not the private _split module
from sklearn.metrics import f1_score

X, y = make_moons(300, noise=0.4)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)
clf1 = DecisionTreeClassifier(max_depth=4).fit(Xtrain, ytrain)
clf2 = IDecisionTreeClassifier(max_depth=4).fit(Xtrain, ytrain)

print("=== original decision tree ===")
features = ["ft" + str(i) for i in range(X.shape[1])]
print(tree_to_code(clf1, features))  # output large tree
print("=== simplified (interpretable) decision tree ===")
print(tree_to_code(clf2, features))

h = 0.02
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

plt.subplot(1, 2, 1)
plt.title("original decision tree. F1: " + str(f1_score(ytest, clf1.predict(Xtest))))
Z = clf1.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=.8)
def train_and_test(alpha, predictors, predictor_params, x_filename, y_filename,
                   n_users, percTest, featureset_to_use, diff_weighting, phi,
                   force_balanced_classes, do_scaling, optimise_predictors,
                   report, conf_report=None):
    # all_X = numpy.loadtxt(x_filename, delimiter=",")
    all_X = numpy.load(x_filename + ".npy")
    all_y = numpy.loadtxt(y_filename, delimiter=",")
    print("loaded X and y files", x_filename, y_filename)

    # note the call order: numpy.isnan(...).any(), not numpy.isnan(all_X.any())
    if numpy.isnan(all_X).any():
        print("nan in", x_filename)
        exit()
    if numpy.isnan(all_y).any():
        print("nan in", y_filename)
        exit()

    # print("selecting balanced subsample")
    print("t t split")
    X_train, X_test, y_train, y_test = train_test_split(
        all_X, all_y, test_size=percTest, random_state=666)

    # feature extraction
    # test = SelectKBest(score_func=chi2, k=100)
    # kb = test.fit(X_train, y_train)
    # # summarize scores
    # numpy.set_printoptions(precision=3)
    # print(kb.scores_)
    # features = kb.transform(X_train)
    # mask = kb.get_support()
    # # summarize selected features
    # print(features.shape)
    # X_train = X_train[:, mask]
    # X_test = X_test[:, mask]

    scaler = StandardScaler()
    rdim = FeatureAgglomeration(n_clusters=100)
    if do_scaling:
        # input(X_train.shape)
        X_train = rdim.fit_transform(X_train)
        X_test = rdim.transform(X_test)
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        with open('../../../isaac_data_files/qutor_scaler.pkl', 'wb') as output:
            pickle.dump(scaler, output, pickle.HIGHEST_PROTOCOL)
        with open('../../../isaac_data_files/qutor_rdim.pkl', 'wb') as output:
            pickle.dump(rdim, output, pickle.HIGHEST_PROTOCOL)

    # print("feature reduction...")
    # pc = PCA(n_components=100)
    # X_train = pc.fit_transform(X_train)
    # X_test = pc.transform(X_test)

    classes = numpy.unique(y_train)
    sample_weights = None
    if force_balanced_classes:
        X_train, y_train = balanced_subsample(X_train, y_train, 1.0)  # 0.118

    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)

    print("tuning classifier ...")
    for ix, p in enumerate(predictors):
        print(type(p))
        print(p.get_params().keys())
        if optimise_predictors == True and len(predictor_params[ix]) > 1:
            pbest = run_random_search(p, X_train, y_train, predictor_params[ix])
        else:
            pbest = p.fit(X_train, y_train)
        predictors[ix] = pbest

    print("pickling classifier ...")
    for ix, p in enumerate(predictors):
        p_name = predictor_params[ix]['name']
        with open('../../../isaac_data_files/p_{}_{}_{}.pkl'.format(
                p_name, alpha, phi), 'wb') as output:
            pickle.dump(p, output, pickle.HIGHEST_PROTOCOL)
    print("done!")

    # report.write("RUNS,P,FB,WGT,ALPHA,PHI,SCL,0p,0r,0F,0supp,1p,1r,1F,1supp,avg_p,avg_r,avg_F,#samples\n")
    for ix, p in enumerate(predictors):
        report.write(",".join(map(str, (
            all_X.shape[0],
            str(p).replace(",", ";").replace("\n", ""),
            force_balanced_classes, diff_weighting, alpha, phi, do_scaling))))

        y_pred_tr = p.predict(X_train)
        y_pred = p.predict(X_test)
        # for x, y, yp in zip(X_train, y_test, y_pred):

        if conf_report:
            conf_report.write(str(p).replace(",", ";").replace("\n", "") + "\n")
            conf_report.write(str(alpha) + "," + str(phi) + "\n")
            conf_report.write(str(confusion_matrix(y_test, y_pred)) + "\n")
            conf_report.write("\n")

        # p = precision_score(y_test, y_pred, average=None, labels=classes)
        # r = recall_score(y_test, y_pred, average=None, labels=classes)
        # F = f1_score(y_test, y_pred, average=None, labels=classes)
        p, r, F, s = precision_recall_fscore_support(
            y_test, y_pred, labels=classes, average=None,
            warn_for=('precision', 'recall', 'f-score'))
        avp, avr, avF, _ = precision_recall_fscore_support(
            y_test, y_pred, labels=classes, average='weighted',
            warn_for=('precision', 'recall', 'f-score'))
        for ix, c in enumerate(classes):
            report.write(",{},{},{},{},{},".format(c, p[ix], r[ix], F[ix], s[ix]))
        report.write("{},{},{},{}\n".format(avp, avr, avF, numpy.sum(s)))
        # report.write(classification_report(y_test, y_pred) + "\n")
        # report.write("------END OF CLASSIFIER------\n")
        report.flush()
    return X_train, X_test, y_pred_tr, y_pred, y_test, scaler
import numpy as np
import pandas as pd  # added: pandas, statsmodels and the split are used below but were not imported in this excerpt
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from nltk.chunk.util import accuracy

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 200)

# RainTomorrow is the dependent variable; its yes/no values must be converted
# to numbers. The remaining columns are the independent variables.
data = pd.read_csv('../testdata/weather.csv')
# print(data)  # (366, 12)
data2 = pd.DataFrame()
data2 = data.drop(['Date', 'RainToday'], axis=1)  # columns to exclude
data2.RainTomorrow = data2.RainTomorrow.map({'Yes': 1, 'No': 0})
print(data2)

# Split into train / test datasets (to guard against overfitting)
train, test = train_test_split(data2, test_size=0.3, random_state=52)  # random_state works like random.seed()
print(data2.shape, train.shape, test.shape)  # (366, 10) (256, 10) (110, 10)

# Classification model
col_select = " + ".join(train.columns.difference(['RainTomorrow']))
my_formula = 'RainTomorrow ~ ' + col_select
# model = smf.glm(formula=my_formula, data=train, family=sm.families.Binomial()).fit()
model = smf.logit(formula=my_formula, data=train).fit()  # fit the classification model
# Variables with P>|z| greater than 0.05 can be judged unsuitable as independent variables.
print(model.summary())
# print(model.params.values)

print('predicted :', np.rint(model.predict(test)[:10].values))
print('actual    :', test.RainTomorrow[:10].values)

# Classification accuracy
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from sklearn import datasets  # added: datasets, the split and StandardScaler are used below but were not imported in this excerpt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 200)

iris = datasets.load_iris()
print(iris.keys())
# ===========================================================
# Columns 2 and 3 of every row: classify the three flower species from the
# two columns petal.length and petal.width.
x = iris.data[:, [2, 3]]
y = iris.target
print(x[:3])
print(y[:3], ' ', set(y))

# Train / test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)  # (105, 2) (45, 2) (105,) (45,)

# Scaling (not strictly needed here, but often useful): standardize the data
# so the whole distribution has mean 0 and variance 1.
print(x_train[:3])  # [[3.5 1. ] [5.5 1.8] [5.7 2.5]]
print(x_test[:3])   # [[5.1 2.4] [4.  1. ] [1.4 0.2]]
sc = StandardScaler()
sc.fit(x_train)  # fit on the training data only; also fitting on x_test would leak test-set statistics
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)
print(x_train[:3])
print(x_test[:3])