def nldas_correlate(self):
    """Regress station wind anomalies on NLDAS anomalies and plot both fits.

    Loads the pickled NLDAS records from ``wind/nldas.p``, selects the record
    whose ``station_id`` equals ``self.station_id``, restricts it to the date
    range spanned by ``self.date`` and fits one linear model for wind speed
    and one for wind direction.  The residual standard deviation of each fit
    is printed and the observations plus fitted lines are shown in a
    two-panel figure.

    Raises:
        ValueError: if no NLDAS record matches ``self.station_id``.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only load a
    # trusted file here.
    with open("wind/nldas.p", "rb") as handle:
        nldas_list = pickle.load(handle)

    nldas = next(
        (rec for rec in nldas_list if self.station_id == rec.station_id),
        None)
    if nldas is None:
        raise ValueError(
            "no NLDAS record for station %s" % (self.station_id,))

    nldas_idx = np.where(np.logical_and(nldas.date >= np.min(self.date),
                                        nldas.date <= np.max(self.date)))

    def fit_anomaly(nldas_anomaly, station_anomaly, label):
        # Only samples with a valid station observation are used.
        # assumes both series are aligned sample-for-sample -- TODO confirm
        mask = ~np.isnan(station_anomaly)
        x = nldas_anomaly[mask]
        y = station_anomaly[mask]
        features = x.reshape((len(x), 1))
        model = lr()
        model.fit(features, y)
        fitted = model.predict(features)
        # Residual standard deviation with n - 2 in the denominator.
        std = np.sqrt(np.sum((y - fitted) ** 2) / (len(fitted) - 2))
        print("Standard deviation of the %s estimate is %s" % (label, std))
        return x, y, fitted

    speed_x, speed_y, speed_fit = fit_anomaly(
        nldas.wind_speed_anomaly[nldas_idx], self.wind_speed_anomaly,
        "wind speed")
    dir_x, dir_y, dir_fit = fit_anomaly(
        nldas.wind_dir_anomaly[nldas_idx], self.wind_dir_anomaly,
        "wind direction")

    fig = plt.figure()
    ax1 = fig.add_subplot(211)
    ax1.plot(speed_x, speed_y, '.b')
    ax1.plot(speed_x, speed_fit, '-r')
    ax2 = fig.add_subplot(212)
    ax2.plot(dir_x, dir_y, '.g')
    ax2.plot(dir_x, dir_fit, '-r')
    plt.show()
def main():
    """Run the 'cleveland' workflow: load, plot, scale, encode, fit, evaluate."""
    plot_directory = make_plotdir()
    train_X, test_X, train_y, test_y = load_data(
        'cleveland', plot_directory, print_out=False)

    test_incoming(test_X, train_X)
    for frame, tag in ((train_X, 'Train'), (test_X, 'Test')):
        plot_hists(frame, plot_directory, label=tag)

    numeric_columns = ['age', 'b_pressure', 'cholesterol', 'heart_rate',
                       'exer_depress', 'fluor_count']
    train_X, test_X = scale_data(train_X, test_X, numeric_columns)

    # Only 'chest_pain' is one-hot encoded; the wider set
    # ['chest_pain', 'ecg_type', 'exer_slope', 'thal_defect'] is disabled.
    categorical_columns = ['chest_pain']
    train_X, test_X = one_hot_encode(train_X, test_X, categorical_columns)
    feature_names = list(train_X.columns)

    logistic = lr()
    fit_predict(logistic, train_X, train_y, test_X, test_y, label='logistic')
    cross_validate(logistic, train_X, train_y['Y'], print_out=True)
    print_lr_coefs(logistic, feature_names)

    # data must first be scaled
    svc = LinearSVC()
    fit_predict(svc, train_X, train_y, test_X, test_y, label='svc')
    cross_validate(svc, train_X, train_y['Y'], print_out=True)

    explore_pca(train_X)
def classify(train_data_filename, train_label_filename, dev_data_filename,
             dev_label_filename, train_feature_dir, dev_feature_dir,
             feature_list, model_type='LR', regularizer='l1', alpha=1.0,
             converg_tol=0.01, verbose=1, folds=2, n_jobs=-1,
             score_eval='f1'):
    """Build a linear classifier and score it on dev data or by cross-validation.

    Args:
        model_type: 'LR' or 'SVM'; anything else terminates via ``sys.exit``.
        regularizer, alpha, converg_tol: passed to the estimator as
            ``penalty``, ``C`` and ``tol``.
        folds: when < 1 the separate dev set is used and dev accuracy is the
            score; otherwise stratified cross-validation on the training data
            is scored with ``score_eval``.
        n_jobs: parallelism for ``cross_val_score``.

    Returns:
        dict with 'loss' (negated score), 'status' (``STATUS_OK``) and the
        (unfitted in the cross-validation case) 'model'.
    """
    if model_type == 'LR':
        model = lr(penalty=regularizer, C=alpha, tol=converg_tol)
    elif model_type == 'SVM':
        model = svm.LinearSVC(penalty=regularizer, C=alpha, tol=converg_tol)
    else:
        sys.exit('Model type ' + model_type + ' not supported')

    train_X, train_Y = load_features(train_data_filename, train_label_filename,
                                     train_feature_dir, feature_list, verbose)

    if folds < 1:
        # Separate dev data is available, so no cross-validation is needed.
        # Dev features are loaded with the training vocabulary.
        dev_X, dev_Y = load_features(dev_data_filename, dev_label_filename,
                                     dev_feature_dir, feature_list, verbose,
                                     vocab_source=train_feature_dir)
        dev_f1, dev_acc, train_f1, train_acc = compute_evaluation_metrics(
            train_X, train_Y, dev_X, dev_Y, model)
        print('train acc: ' + str(train_acc))
        print('dev acc: ' + str(dev_acc))
        neg_loss = dev_acc
    else:
        # No separate dev data: estimate the score by cross-validation.
        skf = StratifiedKFold(train_Y, folds, random_state=17)
        neg_loss = cross_val_score(model, train_X, train_Y, cv=skf,
                                   scoring=score_eval, n_jobs=n_jobs).mean()
        # The original printed the undefined name ``f1`` here (NameError).
        print('crossvalidation ' + str(score_eval) + ': ' + str(neg_loss))

    return {'loss': -neg_loss, 'status': STATUS_OK, 'model': model}
def trainModel():
    """Fit ``lr`` on train.features/train.labels and print sanity checks.

    Each line of ``train.features`` is a comma-separated list of integers and
    each line of ``train.labels`` a single integer.  After fitting, the
    sigmoid of the prediction and of the manually computed linear score for
    sample 45 are printed, the latter once with the freshly fitted
    coefficients and once with the ones stored in ``lr_coeff.npy`` /
    ``lr_intercept.npy``.
    """
    # ``with`` guarantees the files are closed even if parsing fails.
    with open('train.features') as fh:
        X = [[int(x1) for x1 in line.strip().split(',')] for line in fh]
    with open('train.labels') as fh:
        Y = [int(line.strip()) for line in fh]

    clf = lr()
    clf.fit(X, Y)

    sample = X[45]
    # predict() expects a 2-D batch, so the single sample is wrapped.
    print(sigmoid(clf.predict([sample])))
    print(clf.coef_)
    print(clf.intercept_)

    score = np.dot(clf.coef_, sample) + clf.intercept_
    print(sigmoid(score))

    coeff = np.load("lr_coeff.npy")
    intercept = np.load("lr_intercept.npy")
    score = np.dot(coeff, sample) + intercept
    print(sigmoid(score))
def predict_lr(X_train, X_test, y_train, y_test):
    """Fit ``lr`` on the training split, report accuracy and write a CSV.

    The CSV ``submission_surf_lr.csv`` has the columns ImageId (1-based row
    number), Label (prediction) and TrueLabel.

    Returns:
        The fitted estimator.
    """
    model = lr()
    print("lr started")
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    calc_accuracy("Logistic regression", y_test, predicted)

    image_ids = range(1, len(y_test) + 1)
    table = np.c_[image_ids, predicted, y_test]
    np.savetxt('submission_surf_lr.csv', table, delimiter=',',
               header='ImageId,Label,TrueLabel', comments='', fmt='%d')
    return model
def main():
    """Train ``lr`` on positive/negative image arrays and evaluate it.

    The .npy files are (re)generated with ``convert_to_numpy`` when either
    training file is missing.  Images are flattened to one row per sample and
    min-max scaled, the fitted model is pickled to ``model.sav`` and accuracy
    plus ROC AUC on the test arrays are printed.
    """
    train_pos_file = 'training_images_pos0.npy'
    train_neg_file = 'training_images_neg0.npy'
    # Both files are loaded below, so either one missing requires conversion.
    # (The original checked a misspelled name, 'training_image_neg0.npy'.)
    if not (path.exists(train_pos_file) and path.exists(train_neg_file)):
        convert_to_numpy()

    def flatten_and_scale(images):
        # One row per sample; -1 infers the pixel count from the array.
        return preprocessing.minmax_scale(
            images.reshape((images.shape[0], -1)))

    def build_shuffled_set(pos_images, neg_images):
        # Label positives 1 and negatives 0, then shuffle rows jointly.
        pos = flatten_and_scale(pos_images)
        neg = flatten_and_scale(neg_images)
        X = np.concatenate((pos, neg))
        Y = np.concatenate((np.ones(pos.shape[0]), np.zeros(neg.shape[0])))
        idxs = np.random.permutation(X.shape[0])
        return X[idxs], Y[idxs]

    train_pos = np.load(train_pos_file)
    train_neg = np.load(train_neg_file)
    print(train_pos.shape)
    print(train_neg.shape)
    trainX, trainY = build_shuffled_set(train_pos, train_neg)

    model = lr()
    clf = model.fit(trainX, trainY)
    with open('model.sav', 'wb') as handle:
        pickle.dump(model, handle)

    test_pos = np.load('testing_images_pos0.npy')
    test_neg = np.load('testing_images_neg0.npy')
    print(test_pos.shape)
    print(test_neg.shape)
    testX, testY = build_shuffled_set(test_pos, test_neg)

    print(clf.score(testX, testY))
    # The AUC was previously computed and thrown away; report it.
    print(roc_auc_score(testY, clf.predict_proba(testX)[:, 1]))
def run_lr():
    """Fit ``lr`` on the module-level data, write a CSV and report accuracy.

    Reads the names ``x``, ``y``, ``x_``, ``test`` and ``label_test`` from the
    enclosing scope.  The CSV ``submission_lr.csv`` has the columns ImageId,
    Label and TrueLabel.
    """
    model = lr()
    print("lr started")
    model.fit(x, y)
    predicted = model.predict(x_)

    image_ids = range(1, len(test) + 1)
    table = np.c_[image_ids, predicted, label_test]
    np.savetxt('submission_lr.csv', table, delimiter=',',
               header='ImageId,Label,TrueLabel', comments='', fmt='%d')
    calc_accuracy("Logistic regression", label_test, predicted)
def main():
    """Compare LinearSVC and ``lr`` on raw columns and on PCA components."""
    dfcol, dups = readRawColumns()
    dftrain, dftrain_y, dftest, dftest_y = readRawData(dfcol)
    dftrain = renameColumns(dftrain)
    dftest = renameColumns(dftest)
    print("dftrain shape head", dftrain.shape, "\n", dftrain[:3])
    print("dftest shape head", dftest.shape, "\n", dftest[:3])
    print("dftrain stats\n", dftrain.describe())

    make_plotdir()
    explore_pca(dftrain, dftest, "all")   # 562 columns

    clf = LinearSVC()
    print("fitting LinearSVC")
    fit_predict(clf, dftrain, dftrain_y, dftest, dftest_y,
                'raw data, all cols')
    # Positional column slice; ``DataFrame.ix`` no longer exists in pandas.
    fit_predict(clf, dftrain.iloc[:, :30], dftrain_y,
                dftest.iloc[:, :30], dftest_y, 'raw data, 30 cols')
    # 30 columns not sorted by pca - only 70% accuracy

    X_train, X_test = quick_pca(dftrain, dftest, ncomps=100)

    def pca_sweep(model):
        # Fit on the first j principal components for several values of j.
        preds = []
        for j in [10, 20, 30, 50, 100]:
            p = fit_predict(model, X_train[:, :j], dftrain_y,
                            X_test[:, :j], dftest_y,
                            'pca {:d} cols'.format(j))
            preds.append((j, p))
        return preds

    print("fitting LinearSVC with PCA input")
    plot_pca_fit(pca_sweep(clf), "svc", "SVC")
    do_svc_gridsearch(X_train[:, :30], dftrain_y)

    print("Cross-validating LinearSVC with PCA input")
    get_cv_scores(clf, X_train[:, :30], dftrain_y)
    # randomized, not grouped by subject
    # 30 columns sorted by pca - 89% accuracy

    clf = lr()
    print("fitting Logistic Regression with PCA input")
    plot_pca_fit(pca_sweep(clf), "lr", "Logistic Regression")

    print("Cross-validating Logistic Regression with PCA input")
    get_cv_scores(clf, X_train[:, :30], dftrain_y)

    txt = (
        "\nConclusion: Using PCA as input to Logistic Regression or "
        "LinearSVC is effective, with 91% accuracy using only 30 components "
        "(5.4% of 562 total). For six predicted classes, a classification "
        "report shows precision of 85% and greater (also confirmed by "
        "confusion matrix). \n"
        "Cross-validation gives average fit scores of 89% +- 5%."
    )
    print(txt)
def regressMissingData(x, y, xnew, robust=True):
    '''
    Fill in a missing dependent variable by (robust) linear regression.
    author: Nat

    input:
        x: independent variables with corresponding known dependent variable y
        y: dependent variable which is known
        xnew: independent variables whose dependent variable is MISSING
        robust: when true a RANSAC regressor wrapped around a linear
            regression is used, otherwise a plain linear regression
    output:
        DataFrame with the single column 'WON_MONTH2' holding the regressed
        values for xnew
    '''
    import pandas as pd
    from sklearn.linear_model import LinearRegression as lr

    # Only the model that is actually returned is fitted.
    if robust:
        from sklearn.linear_model import RANSACRegressor as ransac
        model = ransac(lr())
    else:
        model = lr()
    model.fit(x, y)
    return pd.DataFrame(model.predict(xnew), columns=['WON_MONTH2'])
def train_edge_classification(X_train, Y_train):
    """
    Fit a ``TopKRanker`` wrapped around ``lr`` on the training set.
    :param X_train: Edge features of the train set.
    :param Y_train: Edge labels of the train set.
    :return: The fitted classifier.
    """
    ranker = TopKRanker(lr())
    ranker.fit(X_train, Y_train)
    return ranker
def get_model(self,args):
    """Build the estimator described by ``args['model']``.

    Args:
        args: configuration dict; ``args['model']['model']`` selects 'LR' or
            'SVM' and the ``regularizer_*`` / ``C_*`` entries configure it.

    Returns:
        An unfitted ``lr`` or ``svm.LinearSVC`` instance.

    Raises:
        ValueError: if the model name is neither 'LR' nor 'SVM'.
    """
    cfg = args['model']
    kind = cfg['model']
    if kind == 'LR':
        return lr(penalty=cfg['regularizer_lr'], C=cfg['C_lr'],
                  n_jobs=self.cjobs)
    if kind == 'SVM':
        penalty = cfg['regularizer_svm']
        # LinearSVC takes no ``n_jobs`` argument; passing it raised TypeError.
        if penalty == 'l1':
            # The l1 penalty is only supported in the primal formulation.
            return svm.LinearSVC(C=cfg['C_svm'], penalty=penalty, dual=False)
        return svm.LinearSVC(C=cfg['C_svm'], penalty=penalty)
    raise ValueError('Model type %r not supported' % (kind,))
def get_classifier():
    """Fit ``lr`` on an 80% split of the module-level ``query_features``.

    Columns 2..22 are the predictors and the last column is the target.

    Returns:
        The value returned by ``fit`` on the training split.
    """
    columns = query_features.columns
    predictors = query_features[columns[2:23]]
    target = query_features[columns[-1]]
    x_train, _x_test, y_train, _y_test = sk_model.train_test_split(
        predictors, target, test_size=0.2)
    model = lr(max_iter=1000)
    return model.fit(x_train, y_train)
def fit_and_test():
    """Fit ``lr`` on 90% of the training data and print the validation score."""
    data, target = pd.read_train()
    split = t(data, target, test_size=0.1)
    train_x, val_x, train_y, val_y = split

    model = lr()
    model.fit(train_x, train_y)

    print("Score on validation")
    validation_score = model.score(val_x, val_y)
    print(validation_score)
def linear_regression(x, y):
    """Fit ``lr`` to (x, y) and show the points together with the fitted line."""
    lineerreg = lr()            # the regression model
    lineerreg.fit(x, y)         # learn from the sample data
    fitted = lineerreg.predict(x)   # predicted once and reused for the line

    plt.scatter(x, y)               # show the data points
    plt.plot(x, fitted, c="red")    # draw the fitted line
    plt.show()                      # display the figure
def train_edge_classification(X_train, Y_train):
    """
    Fit a ``TopKRanker`` wrapped around ``lr`` on the given training data.
    :param X_train: Training features.
    :param Y_train: Training labels.
    :return: The fitted classifier.
    """
    ranker = TopKRanker(lr())
    ranker.fit(X_train, Y_train)
    return ranker
def construct_all_models(self, hyperTune):
    """Create the SVM, logistic-regression and KNN candidates and tune them.

    When ``hyperTune`` is true, ``self.models`` is first filled with
    ``[estimator, parameter_grid]`` pairs and each entry is then replaced by
    the result of ``self.train_with_hyperParamTuning``.
    """
    if hyperTune:
        svm_candidate = [SVC(kernel='linear', probability=True),
                         dict(C=np.arange(0.01, 2.01, 0.2))]
        logistic_candidate = [lr(), dict(C=np.arange(0.1, 3, 0.1))]
        knn_candidate = [KNeighborsClassifier(),
                         dict(n_neighbors=range(1, 100))]
        self.models = {
            'SVM': svm_candidate,
            'LogisticRegression': logistic_candidate,
            'KNN': knn_candidate,
        }
        # Replace every candidate pair with its trained, tuned classifier.
        for name in list(self.models):
            estimator, grid = self.models[name]
            self.models[name] = self.train_with_hyperParamTuning(
                estimator, name, grid)
        print('\nTraining process finished\n\n\n')
def eval_node_classification(X_train, Y_train, X_test, Y_test):
    """Fit a top-k ranker and return (micro-F1, macro-F1) on the test set.

    For every test row, the row sum of ``Y_test`` is used as the number of
    labels to predict.
    """
    labels_per_row = list(Y_test.sum(axis=1))
    ranker = TopKRanker(lr(solver='liblinear'))
    ranker.fit(X_train, Y_train)
    predicted = ranker.predict(X_test, labels_per_row)
    scores = [f1_score(Y_test, predicted, average=kind)
              for kind in ('micro', 'macro')]
    return scores[0], scores[1]
def getR2(y_actual, factor, isRet=False):
    """Return the R^2 of a one-variable linear fit of ``y_actual`` on ``factor``.

    With ``isRet`` set, the target becomes the log ratio of consecutive
    ``y_actual`` values and the last ``factor`` value is dropped so that both
    series keep the same length.
    """
    count = len(y_actual)
    target = np.array(y_actual).reshape((count, 1))
    predictor = np.array(factor).reshape((count, 1))
    if isRet:
        target = np.log(target[1:] / target[:-1])
        predictor = predictor[:-1]

    model = lr()
    model.fit(predictor, target)
    fitted = model.predict(predictor)
    return r2_score(target, fitted)
def explore_params(loans_X, loans_y, plotdir, app, appf):
    '''Grid-search the C parameter of ``lr`` on the training data with
    10-fold accuracy scoring, print the scores and box-plot them.'''
    # NOTE(review): ``grid_scores_`` is, as far as I know, only available in
    # older scikit-learn releases -- confirm the pinned version.
    c_values = [0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 10.0]
    search = GridSearchCV(estimator=lr(), param_grid=[{'C': c_values}],
                          cv=10, verbose=1, n_jobs=-1, scoring='accuracy')
    search.fit(loans_X, loans_y)    # fit all grid parameters

    print("gs grid scores\n", search.grid_scores_)
    summary = (search.best_score_, search.best_params_,
               search.best_estimator_)
    print("gs best score %.5f %s\n%s" % summary)
    gridscore_boxplot(search.grid_scores_, plotdir, app, appf, "C",
                      "solver='liblinear'")
def cal_linear_reg_r(y, x=None):
    '''
    Fit a one-variable linear regression to the points in y and return
    (slope, intercept).
    y and x may be a list, pd.Series or np.array; when x is missing the
    positions 0..len(y)-1 are used as the independent variable.
    '''
    # NOTE(review): if ``isnull`` is pandas.isnull, an array-like x yields an
    # array here and the truth test would be ambiguous -- confirm.
    positions = range(0, len(y)) if isnull(x) else x
    X = pd.DataFrame({'X': positions})
    y = pd.Series(y)
    mdl = lr().fit(X, y)
    return mdl.coef_[0], mdl.intercept_
def myLr(x, y, xnew):
    '''
    Fit sklearn.linear_model.LinearRegression on (x, y) and predict xnew,
    replacing negative predictions with 0.
    author: Nat
    '''
    import numpy as np
    from sklearn.linear_model import LinearRegression as lr

    regressor = lr()
    regressor.fit(x, y)
    predicted = regressor.predict(xnew)
    # Negative predictions are floored at zero.
    return np.where(predicted < 0, 0, predicted)
def _plotDegreedist(degree_df, plot_model=False, path=None):
    """
    Plot the degree distribution on linear and on log-log axes.

    Args:
        degree_df (pandas.DataFrame): data frame with the degree values in
            the column "degree".

        plot_model (bool): Whether to fit and draw a linear approximation
            line in the log-log panel (slope and r2 go into the title).

        path (str): Passed to ``fig.savefig`` when not None.
            If None, the figure is not saved. Default is None.
    """
    from sklearn.linear_model import LinearRegression as lr

    counts = degree_df.degree.value_counts()
    dist = counts / counts.sum()
    # ``np.int`` was removed from NumPy; it was an alias of the builtin int.
    dist.index = dist.index.astype(int)

    fig, ax = plt.subplots(1, 2)
    ax[0].scatter(dist.index.values, dist.values, c="black")
    ax[0].set_title("degree distribution")
    ax[0].set_xlabel("k")
    ax[0].set_ylabel("P(k)")

    x = np.log(dist.index.values).reshape([-1, 1])
    y = np.log(dist.values).reshape([-1, 1])

    if plot_model:
        model = lr()
        model.fit(x, y)
        x_ = np.array([-1, 5]).reshape([-1, 1])
        y_ = model.predict(x_)
        ax[1].set_title(
            f"degree distribution (log scale)\nslope: {model.coef_[0][0] :.4g}, r2: {model.score(x,y) :.4g}"
        )
        ax[1].plot(x_.flatten(), y_.flatten(), c="black", alpha=0.5)
    else:
        ax[1].set_title("degree distribution (log scale)")

    ax[1].scatter(x.flatten(), y.flatten(), c="black")
    ax[1].set_ylim([y.min() - 0.2, y.max() + 0.2])
    ax[1].set_xlim([-0.2, x.max() + 0.2])
    ax[1].set_xlabel("log k")
    ax[1].set_ylabel("log P(k)")

    if path is not None:
        fig.savefig(path, transparent=True)
    plt.show()
def regression(self, metric="root_mean_squared_error", folds=10, alphas=[], graph=False):
    """Cross-validate a fixed set of regressors and display a comparison.

    The candidate models are stored in ``self.models``.  Each one is scored
    with ``cross_val_score`` on ``self.Xt_train`` / ``self.yt_train`` using
    un-shuffled K-fold splitting; the sign of the scores is flipped before
    they are summarised (mean, std, coefficient of variation) in a table
    sorted by mean score.  With ``graph`` a box plot of the per-fold scores
    is shown as well.  Always returns None.
    """
    size = 1.3 * self.report_width // 10

    models = {
        "Linear regressor": lr(),
        "Lasso regressor": lassor(),
        "Lasso CV regressor": lassocvr(),
        "Ridge regressor": rr(alpha=0, normalize=True),
        "Ridge CV regressor": rcvr(alphas = alphas),
        "K nearest neighbors regressor K2u": knnr(n_neighbors=2, weights='uniform'),
        "K nearest neighbors regressor K2d": knnr(n_neighbors=2, weights='distance'),
        "K nearest neighbors regressor K5": knnr(n_neighbors=5),
        "K nearest neighbors regressor K10": knnr(n_neighbors=10),
        "SGD regressor": sgdr(max_iter=10000, warm_start=True),
        "Decision tree regressor": dtr(),
        "Decision tree regressor D3": dtr(max_depth=3),
        "Random forest regressor": rfr(),
        "Ada boost regressor": abr(),
        "Gradient boost regressor": gbr(),
        "Support vector regressor": svr(),
    }
    self.models = models

    print('\n')
    print(self.report_width * '*', '\n*')
    print('* REGRESSION RESULTS - BEFORE PARAMETERS BOOSTING \n*')

    splitter = KFold(n_splits=folds)
    targets = self.yt_train.values.ravel()
    results = []
    names = []
    for model_name, candidate in models.items():
        fold_scores = cross_val_score(candidate, self.Xt_train, targets,
                                      cv=splitter, scoring=metric)
        results.append(-1 * fold_scores)
        names.append(model_name)
    print(self.report_width * '*', '')

    report = pd.DataFrame({'Regressor': names, 'Score': results})
    report['Score (avg)'] = report.Score.apply(lambda scores: scores.mean())
    report['Score (std)'] = report.Score.apply(lambda scores: scores.std())
    report['Score (VC)'] = 100 * report['Score (std)'] / report['Score (avg)']
    report.sort_values(by='Score (avg)', inplace=True)
    report.drop('Score', axis=1, inplace=True)
    display(report)
    print('\n')

    if graph:
        fig, ax = plt.subplots(figsize=(size, 0.5 * size))
        plt.title('Regressor Comparison')
        plt.boxplot(results)
        ax.set_xticklabels(names)
        plt.xticks(rotation=45)
        plt.subplots_adjust(hspace=0.0)
        plt.show()

    return None
def test(self, model_name, graph=False):
    """Fit the named model on the train subset and report on the test subset.

    For ``self.strategy == 'regression'`` a residual analysis is run via
    ``self.residual`` and, with ``graph``, observed values are plotted
    against predictions.  Otherwise accuracy and a classification report are
    printed; with ``graph`` the confusion matrix and report are drawn as
    heat maps, without it the report is displayed as a table.
    Always returns None.
    """
    size = 1.3 * self.report_width // 10
    model = self.models[model_name]

    # fit using the train subset
    model.fit(self.Xt_train, self.yt_train)

    # evaluate using the test subset
    X, y = self.Xt_test, self.yt_test

    if self.strategy == 'regression':
        y_hat = model.predict(X)
        # show residual analysis
        self.residual(y, y_hat, model_name, graph)
        if graph:
            # show the correlation between y and y_hat
            fig, ax = plt.subplots(figsize=(size, 0.5 * size))
            plt.title('Model Overall Performance')
            plt.scatter(y, y_hat, color='g')
            viewer = lr()
            trend = viewer.fit(y, y_hat).predict(y)
            plt.plot(y, trend, color='k')
            plt.xlabel('Observed')
            plt.ylabel('Predicted')
            plt.show()
        return None

    y_pred = model.predict(X)
    sample_size = len(y_pred)
    banner = self.report_width * '*'

    print('\n')
    print(banner, '\n*')
    print('* MODEL PERFORMANCE \n*')
    print('* MODEL NAME: ', model_name)
    print('* TEST SAMPLE SIZE: ', sample_size)
    print('* ACCURACY: ', round(accuracy_score(y, y_pred)*100, 1), '%')
    print('* ')
    print(banner, '\n')

    report = classification_report(y, y_pred, output_dict=True)
    if not graph:
        display(pd.DataFrame(report).T)
        return None

    fig, ax = plt.subplots(figsize=(size, 0.3 * size))
    plt.title('Confusion Matrix')
    sns.heatmap(confusion_matrix(y, y_pred), annot=True, cmap='YlGn', fmt='d',)
    plt.xlabel('Predicted')
    plt.ylabel('True Class')
    plt.show()

    fig, ax = plt.subplots(figsize=(size, 0.5 * size))
    plt.title('Classification Report')
    sns.heatmap(pd.DataFrame(report).iloc[0:3].T, annot=True, vmin=0, vmax=1,
                cmap='BrBG', fmt='.2g')
    plt.xlabel('Score')
    plt.show()
    return None
def log_reg(x, y, t, q):
    """Train ``lr`` on (x, y), score it on (t, q) and predict labels for t.

    The score, the error (1 - score) and the elapsed time of fit plus score
    are attached to the estimator as ``result``, ``error`` and ``end``.

    Returns:
        (predicted labels for t, fitted estimator)
    """
    classifier = lr(solver="saga", max_iter=200, multi_class="multinomial",
                    tol=0.1)
    started = timer()

    classifier.fit(x, y)
    accuracy = classifier.score(t, q)
    classifier.result = accuracy
    classifier.error = 1 - accuracy
    classifier.end = timer() - started

    predicted = classifier.predict(t)
    return predicted, classifier
def gs(x, y="prob"):
    """Fit ``lr`` on a 70/30 split of ``x`` and predict the held-out part.

    The first 32 columns are the features and ``x.label`` the target.

    Returns:
        (class probabilities if ``y == "prob"`` else class predictions,
         true labels of the held-out part as an array)
    """
    x_train, x_test, y_train, y_test = tts(x, x.label, test_size=0.3)
    train_features = x_train.iloc[:, :32]
    test_features = x_test.iloc[:, :32]

    model = lr(random_state=0).fit(train_features, y_train)
    predictor = model.predict_proba if y == "prob" else model.predict
    return predictor(test_features), y_test.values
def model_stop(df):
    """Fit ``lr`` for 'traveltime' on weather and polynomial time features.

    Rows at or above the 95th percentile of 'traveltime' are dropped, then
    the 2nd to 4th powers of 'hour' and 'day' are added as columns.

    Returns:
        (fitted model, filtered frame including the new columns,
         list of feature column names)
    """
    # Copy the filtered rows so the column assignments below act on an
    # independent frame instead of a slice of the caller's data.
    df = df[df['traveltime'] < df['traveltime'].quantile(0.95)].copy()

    features = ['rain', 'temp', 'vappr', 'hour', 'hour2', 'hour3', 'hour4',
                'day', 'day2', 'day3', 'day4']
    for power in range(2, 5):
        df['hour' + str(power)] = df['hour'] ** power
        df['day' + str(power)] = df['day'] ** power

    model = lr(fit_intercept=True).fit(df[features], df['traveltime'])
    return model, df, features
def evaluateNodeClassification(X, Y, test_ratio):
    """Split the data, fit a top-k ranker and return (micro-F1, macro-F1).

    Args:
        X: feature matrix.
        Y: label indicator matrix, dense or sparse.
        test_ratio: passed to ``train_test_split`` as ``test_size``.
    """
    X_train, X_test, Y_train, Y_test = sk_ms.train_test_split(
        X, Y, test_size=test_ratio)
    try:
        top_k_list = list(Y_test.toarray().sum(axis=1))
    except AttributeError:
        # Y_test has no ``toarray``; sum its rows directly.
        top_k_list = list(Y_test.sum(axis=1))

    classif2 = TopKRanker(lr())
    classif2.fit(X_train, Y_train)
    prediction = classif2.predict(X_test, top_k_list)
    micro = f1_score(Y_test, prediction, average='micro')
    macro = f1_score(Y_test, prediction, average='macro')
    return (micro, macro)
def create_model(self, model_type, parameters):
    """Instantiate the estimator for ``model_type`` and apply ``parameters``.

    Args:
        model_type: one of 'lr', 'svm', 'mlp', 'rf', 'xgb'.
        parameters: dict forwarded to ``set_params``.

    Returns:
        The value returned by ``set_params`` on the new estimator.

    Raises:
        ValueError: if ``model_type`` is not one of the supported names.
    """
    if model_type == 'lr':
        model = lr()
    elif model_type == 'svm':
        model = svm()
    elif model_type == 'mlp':
        model = mlp()
    elif model_type == 'rf':
        model = rf()
    elif model_type == 'xgb':
        model = xgb()
    else:
        # Previously this fell through to an UnboundLocalError on ``model``.
        raise ValueError('unsupported model_type: %r' % (model_type,))
    return model.set_params(**parameters)
def version1():
    """Fit ``lr`` on vectorized review text and return the fitted model."""
    # Logistic Regression Model
    # NOTE(review): the result of this call is not captured; features_train,
    # features_test, labels_train and labels_test are read from the enclosing
    # scope below -- confirm where they are set.
    train_test_split(df["reviewText"], df["Positivity"], 100)

    # One vectorizer is fitted on the training text and reused for the test
    # text; a second, unfitted instance cannot transform.
    vectorizer = cv()
    features_train_vectorized = vectorizer.fit_transform(features_train)
    features_test_vectorized = vectorizer.transform(features_test)

    # Model creation for logistic regression
    model = lr().fit(features_train_vectorized, labels_train)
    predictions = model.predict(features_test_vectorized)

    # NOTE(review): the results of both metric calls are discarded.
    ras(labels_test, predictions)   # Generating prediction score
    cm(labels_test, predictions)
    return model
def __init__(self, model_type=None, column_names=None, metric='f1', **kwargs):
    """Store the configuration and build the estimator for ``model_type``.

    Supported types are 'LR', 'SVM', 'SVMNB', 'MNB' and 'myMNB'; any other
    value sets ``self.model_type`` to 'default' and ``self.model`` to None.
    Hyper-parameters arrive through ``**kwargs`` and are kept in
    ``self.params``, where missing entries receive per-model defaults.
    """
    self.model_type = model_type
    self.column_names = column_names
    self.params = kwargs
    self.trained = None
    self.metric = metric

    params = self.params

    def fill_missing(key, value):
        # A key that is absent or explicitly None receives the default.
        if params.get(key, None) is None:
            params[key] = value

    if model_type == 'LR':
        fill_missing('regularization', 'l1')
        fill_missing('alpha', 1.0)
        self.model = lr(penalty=params['regularization'], C=params['alpha'])
    elif model_type in ('SVM', 'SVMNB'):
        fill_missing('kernel', 'rbf')
        if model_type == 'SVM':
            fill_missing('alpha', 0.1)
        else:
            # SVMNB always uses a linear kernel.
            params['kernel'] = 'linear'
            fill_missing('alpha', 1)
            fill_missing('beta', 0.25)

        if params['kernel'] == 'linear':
            # override regularization parameter to avoid a conflict
            params['regularization'] = 'l2'
            self.model = svm.LinearSVC(C=params['alpha'])
        else:
            fill_missing('degree', 3)
            fill_missing('gamma', 0.0)
            fill_missing('coef0', 0.0)
            self.model = svm.SVC(C=params['alpha'], kernel=params['kernel'],
                                 degree=params['degree'],
                                 gamma=params['gamma'],
                                 coef0=params['coef0'])
    elif model_type == 'MNB':
        params.setdefault('alpha', 1.0)
        self.model = MultinomialNB(alpha=params['alpha'], fit_prior=True)
    elif model_type == 'myMNB':
        params.setdefault('alpha', 1.0)
        self.model = None
    else:
        self.model_type = 'default'
        self.model = None
def linear_model(self, nldas_wind, type = 'speed'):
    """Fit ``lr`` of a station wind anomaly on the given NLDAS series.

    Args:
        nldas_wind: predictor array, indexed with the same mask as the
            station anomaly.
        type: 'speed' selects ``self.wind_speed_anomaly``; any other value
            selects ``self.wind_dir_anomaly``.

    Returns:
        (fitted model, residual standard deviation with n - 2 in the
         denominator)
    """
    target = self.wind_speed_anomaly if type == 'speed' else self.wind_dir_anomaly

    # Samples without a station observation are excluded from the fit.
    valid = ~np.isnan(target)
    predictors = nldas_wind[valid]
    predictors = predictors.reshape((len(predictors), 1))
    observed = target[valid]

    lr_model = lr()
    lr_model.fit(predictors, observed)
    residuals = lr_model.predict(predictors) - observed
    std = np.sqrt(np.sum(residuals ** 2) / (len(observed) - 2))
    return lr_model, std
def predict_lr(X, y, X_train, X_test, y_train, y_test):
    """Fit ``lr``, pickle it, report test accuracy and write a CSV.

    The fitted model is saved to ``logreg_trained_new.sav`` and the CSV
    ``submission_surf_lr.csv`` has the columns ImageId, Label and TrueLabel.
    ``X`` and ``y`` are accepted but not used.
    """
    clf = lr(solver='lbfgs', multi_class='ovr')
    print("======Logistic Regression======")
    clf.fit(X_train, y_train)

    # ``with`` closes the file so the pickle is flushed deterministically.
    with open('logreg_trained_new.sav', 'wb') as handle:
        pickle.dump(clf, handle)

    y_pred = clf.predict(X_test)
    calc_accuracy("Logistic regression", y_test, y_pred)
    np.savetxt('submission_surf_lr.csv',
               np.c_[range(1, len(y_test) + 1), y_pred, y_test],
               delimiter=',', header='ImageId,Label,TrueLabel',
               comments='', fmt='%d')
def LR_from_cfg(params):
    """Return 1 - mean 5-fold CV score of ``lr(**params)``, or a penalty value.

    Combinations of 'penalty', 'dual', 'solver' and 'multi_class' that the
    rules below reject are not evaluated; they return ``1 - 0.001``.
    Reads the data ``X`` and ``y`` from the enclosing scope.
    """
    X_ = X[:]
    clf = lr(**params)

    def combination_is_accepted():
        penalty = params['penalty']
        if penalty not in ('l2', 'l1', 'elasticnet', 'none'):
            return True

        dual = params['dual'] is True
        if penalty == 'l2':
            # The dual formulation is only accepted with liblinear.
            if dual and params['solver'] != 'liblinear':
                return False
            return not (params['solver'] == 'liblinear'
                        and params['multi_class'] == 'multinomial')

        # Every remaining penalty rejects the dual formulation.
        if dual:
            return False
        if penalty == 'l1':
            if params['solver'] == 'liblinear':
                return params['multi_class'] != 'multinomial'
            return params['solver'] == 'saga'
        if penalty == 'elasticnet':
            return params['solver'] == 'saga'
        # penalty == 'none'
        return params['solver'] != 'liblinear'

    if not combination_is_accepted():
        return 1 - 0.001
    return 1 - cross_val_score(clf, X_, y, cv=5).mean()
def evaluateNodeClassification(X_train, X_test, Y_train, Y_test):
    """Fit a top-k ranker and return its predictions for ``X_test``.

    For every test row, the row sum of ``Y_test`` is the number of labels to
    predict.  If fitting or predicting fails, a message is printed and an
    all-zero array with the shape of ``Y_test`` is returned instead.  The F1
    scores computed by the earlier version were never used and are dropped.
    """
    try:
        top_k_list = list(Y_test.toarray().sum(axis=1))
    except AttributeError:
        # Y_test has no ``toarray``; sum its rows directly.
        top_k_list = list(Y_test.sum(axis=1))

    classif2 = TopKRanker(lr())
    try:
        classif2.fit(X_train, Y_train)
        prediction = classif2.predict(X_test, top_k_list)
    except Exception:
        # Best effort by design, but KeyboardInterrupt/SystemExit now pass.
        print('Could not fit node classification model')
        prediction = np.zeros(Y_test.shape)
    return prediction
def log_reg(x, y, t, q):
    """Train ``lr`` on (x, y), print accuracy/error on (t, q), predict t.

    Returns:
        The predicted labels for ``t``.
    """
    classifier = lr(solver = "saga", tol = 0.001, max_iter = 600,
                    n_jobs = -1, fit_intercept = True)
    classifier.fit(x, y)                 # training
    accuracy = classifier.score(t, q)    # evaluation on the test data
    labels = classifier.predict(t)       # predicted labels

    # Report for the user
    print("------------------------------------------")
    print("accuracy rate is %{}" .format(round(accuracy * 100 , 3)))
    print("Error rate is %{}" .format(round((1 - accuracy) * 100 , 3)))
    return labels
def train_model(self):
    '''
    Scale the features, split off 15% as test data and fit ``lr`` on the
    class labels stored in the last column of ``self.frame``.
    Sets self.train, self.test, self.train_labels, self.test_labels and
    self.model; returns True.
    '''
    # NOTE(review): an earlier docstring said "No regularization", but the
    # estimator is built with library defaults -- confirm.
    print('TRAINING MODEL...')
    features = scale(self.frame[:, :-1])
    labels = self.frame[:, -1]

    split = tts(features, labels, random_state=26, test_size=.15)
    self.train, self.test, self.train_labels, self.test_labels = split

    self.model = lr(max_iter=200)
    self.model.fit(self.train, self.train_labels)
    print('DONE!\n')
    return True
def _train_SKLR_Classifier(extractedBases, lbls, params = {}):
    """Fit ``lr`` on the converted features and return it with its classes.

    ``params`` may override 'C' (10), 'penalty' ('l1'), 'class_weight'
    ('auto') and 'tol' (1e-6); defaults are given in parentheses.

    Returns:
        (fitted classifier, list of its ``classes_``)
    """
    # NOTE(review): class_weight='auto' is, as far as I know, rejected by
    # recent scikit-learn releases -- confirm the pinned version.
    defaults = (('C', 10), ('penalty', 'l1'),
                ('class_weight', 'auto'), ('tol', 1e-6))
    settings = {key: params.get(key, fallback) for key, fallback in defaults}

    features = makeSKFormat(extractedBases)
    classifier = lr(**settings)
    classifier.fit(features, lbls)
    return classifier, list(classifier.classes_)
def classify(data_filename, label_filename, feature_dir, list_of_features,
             model_type='LR', regularizer='l1', alpha=1.0, verbose=1):
    """Load the listed features, fit a linear model and score it on the same data.

    Args:
        data_filename, feature_dir: forwarded to ``feature_loader.load_feature``.
        label_filename: CSV with a header row and an index column.
        list_of_features: feature descriptions to load and stack column-wise.
        model_type: 'LR' or 'SVM'; anything else terminates via ``sys.exit``.
        regularizer, alpha: passed to the estimator as ``penalty`` and ``C``.
        verbose: values > 0 print the shape of every loaded feature.

    Returns:
        dict with 'loss' (negated F1 on the training data) and 'status'.
    """
    labels = pd.read_csv(label_filename, header=0, index_col=0)
    if not os.path.exists(feature_dir):
        os.makedirs(feature_dir)

    items = None
    feature_matrices = []
    column_names = []
    print("Loading features")
    for feature in list_of_features:
        # The caller's verbosity is forwarded (it was hard-coded to 1).
        rows, columns, counts = feature_loader.load_feature(
            feature, feature_dir, data_filename, verbose=verbose)
        if items is None:
            items = rows
        else:
            # Every feature must describe the same items in the same order.
            assert items == rows
        if verbose > 0:
            print("Loaded %s with shape %s" % (feature, counts.shape))
        feature_matrices.append(counts)
        column_names.append(columns)

    # concatenate all features together
    X = sparse.csr_matrix(sparse.hstack(feature_matrices))
    column_names = np.concatenate(column_names)
    if verbose > 0:
        print("Full feature martix size: %s" % (X.shape,))

    if model_type == 'LR':
        model = lr(penalty=regularizer, C=alpha)
    elif model_type == 'SVM':
        model = svm.LinearSVC(C=alpha, penalty=regularizer)
    else:
        sys.exit('Model type ' + model_type + ' not supported')

    # ``.values`` replaces ``as_matrix()``, which pandas no longer provides.
    y = labels.values.ravel()
    model.fit(X, y)
    pred = model.predict(X)
    f1 = f1_score(y_true=y, y_pred=pred)
    print(f1)
    return {'loss': -f1, 'status': STATUS_OK}
def classify_one_model(feature_list, model_type='LR', regularizer='l1', alpha=1.0, converg_tol=0.01, verbose=1, folds=2, n_jobs=-1, score_eval='f1'):
    """Fit one linear model on the training data and predict dev probabilities.

    File names and feature directories are read from the enclosing scope.
    ``folds``, ``n_jobs`` and ``score_eval`` are accepted but not used.

    Returns:
        (predicted class probabilities for the dev set, fitted model,
         dev labels)
    """
    if model_type not in ('LR', 'SVM'):
        sys.exit('Model type ' + model_type + ' not supported')
    estimator_class = lr if model_type == 'LR' else svm.LinearSVC
    model = estimator_class(penalty=regularizer, C=alpha, tol=converg_tol)

    train_X, train_Y = load_features(train_data_filename,
                                     train_label_filename,
                                     train_feature_dir, feature_list,
                                     verbose)
    # Dev features are loaded with the training vocabulary.
    dev_X, dev_Y = load_features(dev_data_filename, dev_label_filename,
                                 dev_feature_dir, feature_list, verbose,
                                 vocab_source=train_feature_dir)

    model.fit(train_X, train_Y)
    return model.predict_proba(dev_X), model, dev_Y
data.drop('F19', axis=1, inplace=True) selector = selector.fit(data, y) #print which features have been selected print "ATTRIBUTES WHICH HAVE BEEN SELECTED\n" for i in xrange(0,len(data.columns)): if(selector.support_[i]==True): print data.columns[i] df1=data[['FAC_NAME','F1','F2','F3','F4','F5','F6','F7','F8','F9','F10','F11','F12','F13','F14','F15','F16','F17','F18','F19','F20','F21','F22']] clf=SVC() #??? scores=cv1(clf,df1,y,cv=10) print "\nSVC Cross validated Scores:\n" print scores clf1=lr() scores1=cv1(clf1,df1,y,cv=10) print "\nLogistic Regression Cross validated Scores:\n" print scores1 model = GaussianNB() scores2=cv1(model,df1,y,cv=10) print "\nNaive Bayes Cross validated Scores:\n" print scores2 model = DecisionTreeClassifier() scores3=cv1(model,df1,y,cv=10) print "\nDecision Trees validated Scores:\n" print scores3 clf=LinearSVC()
def main():
    """Run the loan experiments: fit, predict and plot for several column sets.

    The stages are, in order: raw columns, scaled columns, the six leading
    columns reported by ``print_lr_coefs``, and the column list returned by
    ``run_opt``. Each stage fits a fresh ``lr()`` model on the training frame
    and predicts the test frame.
    """
    app = get_app_title()
    appf = get_app_file()
    plotdir = make_plotdir()

    loans_df, loans_y, test_df, test_y, numeric_vars = load_data()
    indep_vars = numeric_vars

    def fit_and_predict(train_X, eval_X):
        """Fit a fresh model on train_X, predict eval_X, return both."""
        model = lr()
        do_fit(model, train_X, loans_y, print_out=True)
        predicted = do_predict(model, eval_X, test_y, print_out=True)
        return model, predicted

    def scale_pair(train_frame, eval_frame):
        """Fit the scaler on the training frame and apply it to both frames."""
        scaled_train, scaler = scale_train_data(train_frame, print_out=True)
        scaled_eval = scale_test_data(scaler, eval_frame)
        return scaled_train, scaled_eval

    # Stage 1: unscaled columns (original note: score 0.71).
    clf, pred_y = fit_and_predict(loans_df, test_df)
    plot_predict(plotdir, app, appf, "rawvar", indep_vars, test_df, test_y, pred_y)

    # Stage 2: scaled columns (original note: score 0.90).
    loans_X, test_X = scale_pair(loans_df, test_df)
    clf, pred_y = fit_and_predict(loans_X, test_X)
    plot_predict(plotdir, app, appf, "allvar", indep_vars, test_df, test_y, pred_y)

    print("columns:", indep_vars)
    plist = print_lr_coefs(clf, indep_vars)

    # Stage 3: only the first six entries of the coefficient listing.
    top6 = [entry[0] for entry in plist[:6]]
    print("top6:", top6)
    loans_X, test_X = scale_pair(loans_df[top6], test_df[top6])
    clf, pred_y = fit_and_predict(loans_X, test_X)
    print_lr_coefs(clf, top6)
    plot_predict(plotdir, app, appf, "top6", top6, test_df, test_y, pred_y)
    do_roc(clf, test_X, test_y, "top6", top6, app, appf, plotdir)

    explore_params(loans_X, loans_y, plotdir, app, appf)

    # Stage 4: run the optimization routine over the numeric columns.
    clf = lr()
    opt_score, opt_list = run_opt(clf, numeric_vars, loans_df, loans_y,
                                  app, appf, plotdir, rescale=True)
    # Original note: accuracy 73% +- 3% with no scaling (90% with scaling).

    # Redo the exploration with the optimized columns.
    loans_X, test_X = scale_pair(loans_df[opt_list], test_df[opt_list])
    explore_params(loans_X, loans_y, plotdir, app, appf+"opt_")

    clf = lr()
    cross_validate(clf, loans_X, loans_y, print_out=True)

    clf, pred_y = fit_and_predict(loans_X, test_X)
    print("opt_list columns:", opt_list)
    print_lr_coefs(clf, opt_list)
    plot_predict(plotdir, app, appf, "optvar", opt_list, test_df, test_y, pred_y)
print 'f1 macro:', res print # color = cm(1. * i / NUM_COLORS) # color will now be an RGBA tuple # cm = plt.get_cmap('gist_rainbow') # fig = plt.figure(figsize=(8.0, 5.0)) # ax = fig.add_subplot(111) # # ax.set_color_cycle([cm(1. * i / NUM_COLORS) for i in range(NUM_COLORS)]) # ax.plot(range(len(scores)), scores, label=str(threshold)) # ax.text(len(scores) - 1, scores[len(scores) - 1], threshold, fontsize='smaller') # plt.show() print name return res vec_list = [tf(), cv()] clf_list = [svc(), lr()] threshold_list = np.arange(0.5, 3, 0.5) print len(threshold_list) # results_size = (len(vec_list), len(clf_list),len(threshold_list)) # results = np.zeros(results_size, dtype = np.float) # a, b, c = range(3), range(3), range(3) # def my_func(x, y, z): # return (x + y + z) / 3.0, x * y * z, max(x, y, z) grids = np.vectorize(run)(*np.ix_(threshold_list, vec_list, clf_list)) # mean_grid, product_grid, max_grid = grids print len(grids) try: print grids.shape except: print type(grids)
x[:,16] = (x1**4)*x2 x[:,17] = (x1**3)*(x2**2) x[:,18] = (x1**2)*(x2**3) x[:,19] = x1*(x2**4) x[:,20] = x2**5 x[:,21] = x1**6 x[:,22] = (x1**5)*x2 x[:,23] = (x1**4)*(x2**2) x[:,24] = (x1**3)*(x2**3) x[:,25] = (x1**2)*(x2**4) x[:,26] = x1*(x2**5) x[:,27] = x2**6 return x data = np.loadtxt("data_microchip.txt",delimiter=",") m = data[:,0].size x1 = data[:,0] x2 = data[:,1] x = map_features(x1,x2,m) y = data[:,2] reg = lr(C=10) reg.fit(x,y) s = reg.coef_.size theta_ans = np.zeros((s+1)) theta_ans[0] = reg.intercept_[0] theta_ans[1:] = reg.coef_ theta_ans = theta_ans.reshape(s+1,1) print "%.2f%% accuracy"%(reg.score(x,y)*100)
import numpy as np import matplotlib.pyplot as plt from sklearn.linear_model import LinearRegression as lr data = np.loadtxt("ex1data1.txt",delimiter = ',') m = data[:,0].size x = data[:,0].reshape(m,1) y = data[:,1] a = lr(fit_intercept=True) a.fit(x,y) print a.coef_ print a.intercept_ print a.score(x,y) plt.scatter(x,y) plt.plot(x,a.predict(x)) plt.show()
train_data = np.load('train_data.npy')
if load_saved:
    # Rebuild the RBM from the saved report.
    # NOTE(review): the extent of this `if` body is reconstructed from a
    # collapsed line -- confirm against the original file.
    report = np.load("report.npy").item()
    rbm = RBM(len(train_data), report["n_hidden"], report["batch_size"])
    for attr in ("W", "hbias", "vbias"):
        setattr(rbm, attr, report[attr])

# The first 20 columns are turned into class indices via argmax; the rest are
# the inputs pushed through the RBM's hidden layer.
Y = train_data[:, :20].argmax(axis=1)
train_data = train_data[:, 20:]
X = sigmoid(np.dot(train_data, rbm.W) + rbm.hbias)

# NOTE(review): 0.01 is passed positionally; check which constructor
# parameter it is meant for.
classifier = lr(0.01, solver = 'lbfgs', multi_class='multinomial')
classifier.fit(X, Y)

test_data = np.load('test_data.npy')
test_X = sigmoid(np.dot(test_data, rbm.W) + rbm.hbias)
pred = classifier.predict(test_X)

train_ids, train_cuisines, train_ingredients = read_data('train.json')
test_ids, test_cuisines, test_ingredients = read_data('test.json')
del train_ids, train_ingredients, test_cuisines, test_ingredients

# Map the predicted indices back to label values with an encoder fitted on
# the training labels.
le = LabelEncoder().fit(train_cuisines)
pred = le.inverse_transform(pred)

create_submission(test_ids, pred)
import numpy as np import matplotlib.pyplot as plt from sklearn.linear_model import LogisticRegression as lr def show_scatter(): data_admitted = data[data[:,2]==1] data_notadmitted = data[data[:,2]==0] plt.scatter(data_admitted[:,0],data_admitted[:,1],c='r',s=50) plt.scatter(data_notadmitted[:,0],data_notadmitted[:,1],c='b',s=50) x_coordinates = [0,-theta_ans[0][0]/theta_ans[1][0]] y_coordinates = [-theta_ans[0][0]/theta_ans[2][0],0] plt.plot(x_coordinates,y_coordinates) plt.show() data = np.loadtxt("data_logistic_regression.txt",delimiter=",") m = data[:,0].size x = data[:,0:2] y = data[:,2] reg = lr(C=3.2) reg.fit(x,y) s = reg.coef_.size theta_ans = np.zeros((s+1)) theta_ans[0] = reg.intercept_[0] theta_ans[1:] = reg.coef_ theta_ans = theta_ans.reshape(s+1,1) print theta_ans print reg.score(x,y)*100,"% accuracy" show_scatter()
def train(self, train_X, train_Y):
    """Create the model from the stored hyper-parameters and fit it.

    Reads ``self.hp['regularizer']``, ``self.hp['alpha']`` and
    ``self.hp['converg_tol']``; the new estimator is stored on
    ``self.model`` before fitting.
    """
    hyper = self.hp
    estimator = lr(
        penalty=hyper['regularizer'],
        C=hyper['alpha'],
        tol=hyper['converg_tol'],
    )
    self.model = estimator
    estimator.fit(train_X, train_Y)
import random

import matplotlib.pyplot as plt
import pandas as pa
from sklearn.linear_model import LinearRegression as lr

random.seed(1)

tabtrain = pa.read_csv('sources/train.csv')
tabtest = pa.read_csv('sources/test.csv')

# Build the feature tables: drop the timestamp and, for the training set,
# the target column together with the 'casual' and 'registered' columns.
# `axis` is given by keyword; the positional form is rejected by recent pandas.
x_train = tabtrain.drop(['datetime', 'count', 'casual', 'registered'], axis=1)
x_test = tabtest.drop(['datetime'], axis=1)

# Build the target vector.
y_train = tabtrain['count']

# Default estimator: the stray positional argument (`lr(5)`) only landed on
# the first constructor parameter and is rejected by keyword-only signatures.
model = lr()
model.fit(x_train, y_train)

# Predictions for the test set, indexed by timestamp.
y_test = model.predict(x_test)
y_test = pa.DataFrame(y_test)
y_test.index = tabtest['datetime']

print(y_test)
def logistic_regression_speed_test(dftrain, dftrain_y, plotdir):
    """Run the large speed test with a fresh ``lr()`` classifier.

    The run is labelled 'Logistic Regression' and uses 'logreg' as the file
    tag; both are passed to ``speed_test_large`` along with the data and the
    plot directory. (A medium-sized variant, ``speed_test_medium``, is not
    invoked.)
    """
    classifier = lr()
    title, file_tag = 'Logistic Regression', 'logreg'
    speed_test_large(classifier, dftrain, dftrain_y, title, file_tag, plotdir)
testDf = auxiliary.initialise_test(False) ids = testDf['Id'].values # Id,Dates,DayOfWeek,PdDistrict,Address,X,Y,Year,Week,Hour testDf = testDf.drop(['Id', 'Dates', 'Address', 'X', 'Y'], axis=1) # Random Forest Algorithm print list(trainDf.columns.values) print list(testDf.columns.values) #print list(trainDf.X.values) # back to numpy format trainData = trainDf.values testData = testDf.values print 'Training...' logit = lr() logit = logit.fit(trainData[0::,1::], trainData[0::,0]) print 'Predicting...' output = logit.predict_proba(testData).astype(float) output = output.tolist() predictions_file = open("../submissionLR.csv", "wb") open_file_object = csv.writer(predictions_file) open_file_object.writerow(["Id",'ARSON','ASSAULT','BAD CHECKS','BRIBERY','BURGLARY','DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE','DRUG/NARCOTIC','DRUNKENNESS','EMBEZZLEMENT','EXTORTION', 'FAMILY OFFENSES','FORGERY/COUNTERFEITING','FRAUD','GAMBLING','KIDNAPPING','LARCENY/THEFT', 'LIQUOR LAWS','LOITERING','MISSING PERSON','NON-CRIMINAL','OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT','PROSTITUTION','RECOVERED VEHICLE','ROBBERY','RUNAWAY', 'SECONDARY CODES','SEX OFFENSES FORCIBLE','SEX OFFENSES NON FORCIBLE','STOLEN PROPERTY', 'SUICIDE','SUSPICIOUS OCC','TREA','TRESPASS','VANDALISM','VEHICLE THEFT','WARRANTS',
import random

import matplotlib.pyplot as plt
import pandas as pa
from sklearn.linear_model import LinearRegression as lr

random.seed(1)

tabtrain = pa.read_csv('sources/train.csv')
tabtest = pa.read_csv('sources/test.csv')

# Build the feature tables: drop the timestamp and, for the training set,
# the target column together with the 'casual' and 'registered' columns.
# `axis` is given by keyword; the positional form is rejected by recent pandas.
x_train = tabtrain.drop(['datetime', 'count', 'casual', 'registered'], axis=1)
x_test = tabtest.drop(['datetime'], axis=1)

# Build the target vector.
y_train = tabtrain['count']

model = lr()
model.fit(x_train, y_train)

# Predictions for the test set, indexed by timestamp.
y_test = model.predict(x_test)
y_test = pa.DataFrame(y_test)
y_test.index = tabtest['datetime']

print(y_test)
x["miss"] = data.Name.map(lambda x:1 if x.lower().find("miss")>=0 else 0) x["master"] = data.Name.map(lambda x:1 if x.lower().find("master")>=0 else 0) x["embark_C"] = data.Embarked.map(lambda x:1 if x=="C" else 0) x["embark_Q"] = data.Embarked.map(lambda x:1 if x=="Q" else 0) x["embark_S"] = data.Embarked.map(lambda x:1 if x=="S" else 0) #return x p = poly(2, interaction_only=False) return p.fit_transform(x) if __name__ == "__main__": data = pd.read_csv("./data/train.csv") x = makeInput(data) y = data.Survived model = lr(C=0.2) model.fit(x,y) test_data = pd.read_csv("./data/test.csv") x_test = makeInput(test_data) predict = model.predict(x_test) predict = pd.Series(predict) y_test = pd.DataFrame({ "PassengerId": test_data.PassengerId ,"Survived": predict }) y_test.to_csv("./predict.csv", index=False)