def linearSVR(data):
    """Train a LinearSVR on the housing frame and print its r2 score.

    Args:
        data: pandas DataFrame containing a "price" target column plus the
            identifier/location columns dropped below.
    """
    # Drop identifiers, the target, and location/renovation columns;
    # everything that remains is used as a predictor.
    X = data.drop(["id", "date", "price","long","lat", "zipcode","yr_renovated", "sqft_above", "sqft_basement"], axis=1)
    y = data["price"]
    # Fixed random_state keeps the 90/10 split and the model reproducible.
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.10, random_state=42)
    svr = LinearSVR(random_state=42)
    svr.fit(X_train, y_train)
    y_predict = svr.predict(X_test)
    # Fix: the original used the Python-2-only print statement; print()
    # emits the identical text and works under both Python 2 and 3.
    print("r2-score for LinearSVR: %f" % r2_score(y_test, y_predict))
def linearSVR(X, c_param, norm=2):
    """Fit a LinearSVR that regresses the row index against (normalized) X.

    Args:
        X: 2-D array of shape (T, n_features); rows are time steps.
        c_param: regularization strength C passed to LinearSVR.
        norm: 2 to L2-normalize X first; any other value uses X unchanged.

    Returns:
        The fitted coefficient vector ``clf.coef_``.
    """
    if norm == 2:
        XX = normalizeL2(X)
    else:
        # Bug fix: XX was undefined (NameError) whenever norm != 2;
        # fall back to the raw features.
        XX = X
    T = X.shape[0]  # temporal length
    clf = LinearSVR(C=c_param, dual=False, loss='squared_epsilon_insensitive',
                    epsilon=0.1, tol=0.001, verbose=False)  # epsilon is "-p" in C's liblinear and tol is "-e"
    # Targets 1..T: the regressor learns a temporal ordering of the rows.
    clf.fit(XX, np.linspace(1, T, T))
    return clf.coef_
def train(self, trainSet):
    """Train one cascade stage: per-landmark random forests, a global
    linear regression over the binary features, then a shape update.

    Args:
        trainSet: training-set object providing meanShape, residuals,
            initShapes, bndBoxs and ms2reals (project-specific type).

    Side effects:
        Appends the fitted forests to self.rfs and the per-coordinate
        regressors to self.regs; mutates trainSet.initShapes in place.
    """
    pntNum = trainSet.meanShape.shape[0]
    # Budget the total tree count evenly across landmark points.
    treeNum = int(self.maxTreeNum/pntNum)

    ### Train the random forests (one forest per landmark point)
    begTime = time.time()
    for i in xrange(pntNum):
        rf = RandForest(treeDepth = self.treeDepth,
                        treeNum = treeNum,
                        feaNum = self.feaNum,
                        radius = self.radius,
                        binNum = self.binNum,
                        feaRange = self.feaRange)
        rf.train(trainSet, i)
        self.rfs.append(rf)
    elapse = getTimeByStamp(begTime, time.time(), 'min')
    print("\t\tRandom Forest : %f mins"%elapse)

    ### Extract the local binary features
    begTime = time.time()
    feas = self.genFeaOnTrainset(trainSet)
    elapse = getTimeByStamp(begTime, time.time(), 'min')
    print("\t\tExtract LBFs : %f mins"%elapse)

    ### Global regression: one LinearSVR per (x or y) coordinate of every
    ### landmark, fitted on the binary features against the residuals.
    begTime = time.time()
    y = trainSet.residuals
    # Flatten (samples, points, 2) residuals into (samples, points*2).
    y = y.reshape(y.shape[0], y.shape[1]*y.shape[2])
    for i in xrange(pntNum*2):
        ### TODO Show the training result
        reg=LinearSVR(epsilon=0.0,
                      C = 1.0/feas.shape[0],  # scale C with the sample count
                      loss='squared_epsilon_insensitive',
                      fit_intercept = True)
        reg.fit(feas, y[:, i])
        self.regs.append(reg)
    elapse = getTimeByStamp(begTime, time.time(), 'min')
    print("\t\tGlobal Regression : %f mins"%elapse)

    ### Update the initshapes with the regressed deltas
    begTime = time.time()
    for i in xrange(pntNum):
        regX = self.regs[2*i]    # regressor for the x coordinate
        regY = self.regs[2*i+1]  # regressor for the y coordinate
        x = regX.predict(feas)
        y = regY.predict(feas)
        delta = NP.squeeze(NP.dstack((x,y)))
        # Map deltas from mean-shape space back to each real image frame,
        # then rescale by the bounding-box columns 2 and 3.
        delta = Affine.transPntsForwardWithDiffT(delta, trainSet.ms2reals)
        delta = NP.multiply(delta, trainSet.bndBoxs[:,[2,3]])
        trainSet.initShapes[:,i,:] = trainSet.initShapes[:,i,:] + delta
    elapse = getTimeByStamp(begTime, time.time(), 'min')
    print("\t\tUpdate Shape : %f mins"%elapse)
def main(train_file, model_file):
    """Train a LinearSVR on the given training file and persist it.

    Args:
        train_file: path of the training-data file.
        model_file: path the fitted model is dumped to via joblib.
    """
    features, targets = load_trainingData(train_file)
    model = LinearSVR(verbose = 1, epsilon = 0.1)
    logging("training model...")
    t0 = datetime.now()
    model.fit(features, targets)
    logging("training model, eplased time:%s" % str(datetime.now() - t0))
    logging("saving model")
    joblib.dump(model, model_file)
def GlobalRegression(self, lbf, shape_residual):
    """Fit one LinearSVR per landmark coordinate on the LBF features.

    Args:
        lbf: (n_samples, n_features) local binary feature matrix.
        shape_residual: per-sample shape residuals, one column per
            landmark coordinate (landmark_n*2 columns).

    Side effects:
        Fills self.w with the learned coefficient vectors, rescaled
        back to pixel units.
    """
    # Fix: removed the dead `m = K` assignment (m was never used) and
    # the unused `f` from the shape unpack.
    n = lbf.shape[0]
    # prepare linear regression X, Y; residuals are normalized by the
    # reference image width so every target lives on a comparable scale.
    X = lbf
    Y = shape_residual / img_o_width
    # One independent regressor per (x or y) coordinate of each landmark.
    # NOTE(review): despite the original "parallel" comment this loop is
    # sequential.
    for i in xrange(landmark_n*2):
        reg = LinearSVR(epsilon=0.0, C=1.0/n,
                        loss='squared_epsilon_insensitive', fit_intercept = True)
        reg.fit(X, Y[:, i])
        self.w[i] = reg.coef_
    # Undo the width normalization so the weights predict pixels again.
    self.w = self.w * img_o_width
class SVRR(object):
    """Thin LinearSVR wrapper that accepts pandas-style inputs.

    fit() takes a feature frame (anything with ``.values``) and a
    target container indexed by the key 'y'; predict() takes a feature
    frame and returns the raw prediction array.
    """

    def __init__(self, C):
        # Underlying estimator; C controls regularization strength.
        self.regression = LinearSVR(C=C)

    def fit(self, xs, ys):
        # Strip down to raw arrays before handing off to sklearn.
        self.regression.fit(xs.values, ys['y'])

    def predict(self, xs):
        return self.regression.predict(xs.values)
def globalRegress(self, posSet, negSet): self.feaDim = self.getFeaDim() ### Extract the local binary features begTime = time.time() posFeas = self.genFeaOnTrainset(posSet) negFeas = self.genFeaOnTrainset(negSet) t = getTimeByStamp(begTime, time.time(), 'min') print("\t\tExtract LBFs : %f mins"%t) ### Global regression begTime = time.time() y = posSet.residuals y = y.reshape(y.shape[0], y.shape[1]*y.shape[2]) for i in xrange(posSet.pntNum*2): ### TODO Show the training result reg=LinearSVR(epsilon=0.0, C = 1.0/posFeas.shape[0], loss='squared_epsilon_insensitive', fit_intercept = True) reg.fit(posFeas, y[:, i]) self.globalReg.append(reg) t = getTimeByStamp(begTime, time.time(), 'min') print("\t\tGlobal Regression : %f mins"%t) ### Update the initshapes begTime = time.time() for i in xrange(posSet.pntNum): regX = self.globalReg[2*i] regY = self.globalReg[2*i+1] x = regX.predict(posFeas) y = regY.predict(posFeas) delta = NP.squeeze(NP.dstack((x,y))) delta = NP.multiply(delta, posSet.winSize) posSet.initShapes[:,i,:] = posSet.initShapes[:,i,:] + delta x = regX.predict(negFeas) y = regY.predict(negFeas) delta = NP.squeeze(NP.dstack((x,y))) delta = NP.multiply(delta, negSet.winSize) negSet.initShapes[:,i,:] = negSet.initShapes[:,i,:] + delta t = getTimeByStamp(begTime, time.time(), 'min') self.applyPntOffsetIntoTree() print("\t\tUpdate Shape : %f mins"%t)
class LinearSVRPermuteCoef:
    """LinearSVR wrapper that records coefficient extrema on every fit.

    Each call to fit() appends the max and min of the learned
    coefficients to the module-level ``coeffs_state`` dict; the static
    ``permute_*`` helpers expose and reset that shared state.
    """

    def __init__(self, **kwargs):
        # All keyword arguments are forwarded verbatim to LinearSVR.
        self.model = LinearSVR(**kwargs)

    def fit(self, X, y):
        self.model.fit(X, y)
        # Mirror the fitted attributes so this object quacks like a
        # plain sklearn estimator.
        self.coef_ = self.model.coef_
        self.intercept_ = self.model.intercept_
        # Record this fit's coefficient extrema in the shared state
        # (max first, then min, matching the original order).
        coeffs_state['max'].append(np.max(self.coef_))
        coeffs_state['min'].append(np.min(self.coef_))
        return self

    def get_params(self, deep=True):
        return self.model.get_params(deep)

    def set_params(self, **kwargs):
        self.model.set_params(**kwargs)
        return self

    def predict(self, X):
        return self.model.predict(X)

    def score(self, X, y, sample_weight=None):
        # Only forward sample_weight when the caller supplied one.
        if sample_weight is None:
            return self.model.score(X, y)
        return self.model.score(X, y, sample_weight)

    @staticmethod
    def permute_min_coefs():
        return coeffs_state['min']

    @staticmethod
    def permute_max_coefs():
        return coeffs_state['max']

    @staticmethod
    def reset_perm_coefs():
        coeffs_state['min'] = []
        coeffs_state['max'] = []
def meta_model_fit(X_train, y_train, svm_hardness, fit_intercept, number_of_threads, regressor_type="LinearSVR"):
    """
    Trains meta-labeler for predicting number of labels for each user.

    Based on: Tang, L., Rajan, S., & Narayanan, V. K. (2009, April). Large scale multi-label classification via metalabeler.
              In Proceedings of the 18th international conference on World wide web (pp. 211-220). ACM.

    Args:
        X_train: training feature matrix (n_samples x n_features).
        y_train: label-indicator matrix; row sums give per-sample label counts.
        svm_hardness: C regularization parameter for LinearSVR.
        fit_intercept: whether the SVR fits an intercept term.
        number_of_threads: unused here; kept for signature compatibility.
        regressor_type: only "LinearSVR" is supported.

    Returns:
        The fitted LinearSVR model.

    Raises:
        RuntimeError: if regressor_type is not "LinearSVR".
    """
    if regressor_type == "LinearSVR":
        # liblinear guidance: solve the primal (dual=False) when
        # n_samples > n_features, the dual otherwise.
        dual = X_train.shape[0] <= X_train.shape[1]
        model = LinearSVR(C=svm_hardness, random_state=0, dual=dual, fit_intercept=fit_intercept)
        # Meta-target: number of positive labels per sample.
        y_train_meta = y_train.sum(axis=1)
        model.fit(X_train, y_train_meta)
    else:
        # Fix: previously printed a message and raised the bare
        # RuntimeError class; attach the message to the exception so
        # callers see why it failed. Exception type is unchanged.
        raise RuntimeError("Invalid regressor type: %s" % regressor_type)
    return model
def build_svm(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a support vector regression model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """
    # Random state has int value for non-random sampling
    model = LinearSVR(random_state=1, dual=False, epsilon=0,
                      loss='squared_epsilon_insensitive')
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)

    # Regression quality measures on the held-out split, in the order
    # downstream readers of the pickle expect them.
    scores = [
        sklearn.metrics.mean_absolute_error(y_test, predictions),    # MAE
        sklearn.metrics.mean_squared_error(y_test, predictions),     # MSE
        sklearn.metrics.median_absolute_error(y_test, predictions),  # median abs error
        sklearn.metrics.r2_score(y_test, predictions),               # R^2
        sklearn.metrics.explained_variance_score(y_test, predictions),
    ]

    # Persist the model, the five scores, and the raw predictions.
    with open('../trained_networks/svm_%d_data.pkl' % n_features, 'wb') as results:
        for obj in [model] + scores + [predictions]:
            pickle.dump(obj, results, pickle.HIGHEST_PROTOCOL)
    return
# Drop large intermediate objects from the module namespace so only the
# final likesMAT / consARR matrices stay resident.
del globals()['unqLikesLIDs']
del globals()['profilesDF']
del globals()['profiles']
del globals()['profilesLSo']
del globals()['profilesLS']
del globals()['row']
del globals()['tmpLS']
del globals()['tmpAGE']
del globals()['profsTOlikes']
del globals()['i']
del globals()['tmpIND']

# Fixed seed for a reproducible train/test split.
seed = 7
myRand = np.random.seed(seed)  # NOTE(review): np.random.seed returns None
X_train, X_test, y_train, y_test = train_test_split(likesMAT, consARR, test_size=1500)

# Solver tolerance for the SVR comes from the command line.
myTOL = float(sys.argv[1])
mySVM = LinearSVR(tol=myTOL)
#mySVM.fit(likesMAT, consARR)
mySVM.fit(X_train, y_train)
y_pred = mySVM.predict(X_test)

import math
# Report RMSE of the predictions on the held-out split.
myRMSE = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
print("cons, Linear SVM: ", str(myTOL), " ", myRMSE)

# joblib.dump(mySVM, "/Users/jamster/LinearSVM-A-cons.xz", compress=9)
# impSVM = joblib.load("/Users/jamster/LinearSVM-A-cons.xz")
def submit(feature_files, training_dates, feature_set_folder):
    """Train six models on the pooled daily datasets and write each
    model's test-set predictions to CSV for later stacking.

    Python 2 code: relies on print statements and on filter/map
    returning lists.

    Args:
        feature_files: list of feature-file identifiers passed to dfs().
        training_dates: dates whose datasets are concatenated.
        feature_set_folder: subfolder of result/ for the output CSVs.
    """
    train_set1 = pd.concat([
        dfs(0, len(feature_files), feature_files + ['y'], 'dataset/' + date)
        for date in training_dates
    ])
    # Recent rows (time_diff <= 15) are held out as the test split.
    train_set = train_set1[train_set1.time_diff > 15]
    test_set = train_set1[train_set1.time_diff <= 15]
    train_set = train_set.fillna(-1, downcast='infer')
    test_set = test_set.fillna(-1, downcast='infer')
    train_set['y_log'] = train_set['y'].apply(lambda x: np.log(1 + x))
    test_set['y_log'] = test_set['y'].apply(lambda x: np.log(1 + x))
    # Every column except identifiers and targets is a model feature.
    feature_set = filter(
        lambda x: x not in ['y', 'time', 'province', 'market', 'name', 'type', 'y_log'],
        train_set.columns)
    scaler = StandardScaler()
    scaler.fit(train_set[feature_set].as_matrix())
    # Sample weights 1/y^2 emphasize small targets in every fit below.
    # # model1
    model1 = LinearRegression(normalize=True)
    model1.fit(scaler.transform(train_set[feature_set].as_matrix()),
               train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x,
                                 train_set['y'].as_matrix()))
    print zip(feature_set, model1.coef_)
    test_set['predictY'] = model1.predict(
        scaler.transform(test_set[feature_set].as_matrix()))
    test_set.to_csv('result/' + feature_set_folder + '/model1_online_stacking1.csv')
    print test_set
    # model2
    model2 = XGBRegressor(n_estimators=600, learning_rate=0.01, max_depth=6,
                          colsample_bytree=0.7, subsample=0.7,
                          colsample_bylevel=0.7)
    model2.fit(train_set[feature_set].as_matrix(),
               train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x,
                                 train_set['y'].as_matrix()))
    test_set['predictY'] = model2.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model2_online_stacking1.csv')
    # model3: linear SVR on the standardized features
    model3 = LinearSVR(tol=1e-7)
    model3.fit(scaler.transform(train_set[feature_set].as_matrix()),
               train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x,
                                 train_set['y'].as_matrix()))
    test_set['predictY'] = model3.predict(
        scaler.transform(test_set[feature_set].as_matrix()))
    test_set.to_csv('result/' + feature_set_folder + '/model3_offline.csv')
    # model4: random forest (sample_weight wrapped in np.array because
    # sklearn's forest rejects a plain list here)
    model4 = RandomForestRegressor(n_estimators=1000, max_depth=7,
                                   max_features=0.2, max_leaf_nodes=100)
    model4.fit(train_set[feature_set].as_matrix(),
               train_set['y'].as_matrix(),
               sample_weight=np.array(
                   map(lambda x: 1.0 / x / x, train_set['y'].as_matrix())))
    test_set['predictY'] = model4.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model4_online_stacking1.csv')
    # model5: same XGB configuration as model2 but a different seed
    model5 = XGBRegressor(n_estimators=600, learning_rate=0.01, max_depth=6,
                          colsample_bytree=0.7, subsample=0.7,
                          colsample_bylevel=0.7, seed=10000)
    model5.fit(train_set[feature_set].as_matrix(),
               train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x,
                                 train_set['y'].as_matrix()))
    test_set['predictY'] = model5.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model5_online_stacking1.csv')
    # model6: shallower XGB variant (max_depth=5)
    model6 = XGBRegressor(n_estimators=600, learning_rate=0.01, max_depth=5,
                          colsample_bytree=0.7, subsample=0.7,
                          colsample_bylevel=0.7)
    model6.fit(train_set[feature_set].as_matrix(),
               train_set['y'].as_matrix(),
               sample_weight=map(lambda x: 1.0 / x / x,
                                 train_set['y'].as_matrix()))
    test_set['predictY'] = model6.predict(test_set[feature_set].as_matrix())
    test_set.to_csv('result/' + feature_set_folder + '/model6_online_stacking1.csv')
    pass
import csv
from sklearn.svm import LinearSVR as SVR
from PIL import Image
from hand_gen import get_eigenvalues

# Train a linear SVR on the precomputed hand features.
npzfile = np.load('hand_data.npz')
X = npzfile['X']
y = npzfile['y']
# we already normalize these values in gen.py
# X /= X.max(axis=0, keepdims=True)
svr = SVR(C=1)
svr.fit(X, y)
# svr.get_params() to save the parameters
# svr.set_params() to restore the parameters

# predict
# testdata = np.load('data.npz')
testdata = []
ratio = 0.05  # downscale factor applied to every frame
for i in range(1, 482):
    img = Image.open("data/hand/hand.seq%s.png" % str(i))
    width = int(img.size[0] * ratio)
    height = int(img.size[1] * ratio)
    img = img.resize((width, height), Image.BILINEAR)
    # Flatten the resized frame into a single feature row.
    # NOTE(review): img_data is never appended to testdata in this chunk —
    # the rest of the loop body appears to live elsewhere; confirm upstream.
    img_data = np.array(img).flatten()
class Learner:
    """
    Class responsible for training models, finding the best fit and making
    rate predictions based on the best fit model.
    """

    def __init__(self, instrument, predictor):
        """
        Initialize the Learner class based on a predictor and instrument.

        Args:
            instrument: Instrument object.
            predictor: Predictor object.
        """
        self.instrument = instrument
        self.predictor = predictor
        self.init_learning_model()

    def init_learning_model(self):
        """
        Initialize the learning model according to the given predictor.

        Args:
            None.
        """
        # Predictor name selects the estimator; its parameters are applied
        # later in learn() via set_params.
        if self.predictor.name == 'treeRegressor':
            self.model = DecisionTreeRegressor()
        if self.predictor.name == 'linearSVMRegressor':
            self.model = LinearSVR()

    def get_training_samples(self, end_date):
        """
        Retrieve all training samples before the end date.

        Args:
            before: Date object. Retrieve training samples before end_date.

        Returns:
            all_samples: List of TrainingSample.
        """
        last_date = None
        if end_date is not None:
            # Exclusive bound: query up to the day before end_date.
            last_date = end_date - datetime.timedelta(1)
        all_samples = ts.get_samples(instrument=self.instrument,
                                     end=last_date,
                                     order_by=['date'])
        return all_samples

    def learn(self, **kwargs):
        """
        Use the training samples for the given instrument to build a
        learning model for the learner.

        Args:
            Named arguments.
            cv_fold: Integer. Number of folds for cross validation.
            before: Date object. Use samples before this date.

        Returns:
            best_score: float. Best cross validation score from learning.
        """
        cv_fold = kwargs.get('cv_fold')
        end_date = kwargs.get('before')
        all_training_samples = self.get_training_samples(end_date)
        features = [x.features for x in all_training_samples]
        targets = [x.target for x in all_training_samples]
        self.model.set_params(**self.predictor.parameters)
        # Cross-validate first for the average score, then refit the model
        # on the complete sample set.
        scores = cross_val_score(self.model, features, targets, cv=cv_fold)
        ave_score = sum(scores) / len(scores)
        self.model.fit(features, targets)
        return ave_score

    def predict(self, features):
        """
        Use trained model to predict profitable change given the features.

        Args:
            features: List of floats.

        Returns:
            Decimal. Predicted profitable change.
        """
        # The model expects a 2-D array: reshape the single sample.
        features = np.asarray(features).reshape(1, -1)
        predicted = self.model.predict(features)
        return decimal.Decimal(float(predicted)).quantize(TWO_PLACES)
# for the loss: 'epsilon-insensitive loss' is for L1 and 'squared epsilon-insensitive loss' is for L2 # for L1, we can tune the epsilon value, for L2, we can tune the C value. # others are the defaults. regr = LinearSVR(epsilon=0.0, tol=0.0001, C=1.0, loss='squared_epsilon_insensitive', fit_intercept=True, intercept_scaling=1.0, dual=True, verbose=0, random_state=None, max_iter=1000) # fit the model regr.fit(X_train, y_train) # get the prediction prediction_svm_p = regr.predict(X_test) # revert the prediction value prediction_svm_p_ori = prediction_svm_p * (y.max() - y.min()) + y.min() y_test_ori = np.array(y_test * (y.max() - y.min()) + y.min()) # get the score for this model score = regr.score(X_test, y_test) # calculate the mse value for the prediciton. mse_svm_p = np.mean((prediction_svm_p_ori - y_test_ori)**2) print("MSE with penalized SVM:", mse_svm_p) # plot the figure to see the difference between prediction and y_test. plt.plot(y_test_ori, label='y_test_ori')
class TextLearner(object):
    """Text classification/regression pipeline around a LinearSVR.

    Loads pickled (text, label) matrices, TF-IDF-vectorizes the text,
    optionally reduces dimensionality with chi-squared feature selection,
    trains a model, and reports/persists the results. Python 2 code
    (integer division, old sklearn cross_validation module).
    """

    def __init__(self,data_path,model_path = "./",name = ""):
        # Paths and display name for this learner instance.
        self.name = name
        self.data_path = data_path
        self.model_path = model_path
        # Raw pickled matrices.
        self.DesignMatrix = []
        self.TestMatrix = []
        self.X_train = []
        self.y_train = [] # not only train but general purpose too
        self.X_test = []
        self.y_test = []
        self.y_pred = []
        # Fitted pipeline pieces (set by featurize/reduce/train).
        self.vectorizer = None
        self.feature_names = None
        self.chi2 = None
        self.mlModel = None
        self.F = Filter()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        # Drop all heavy state when used as a context manager.
        self.DesignMatrix = []
        self.TestMatrix = []
        self.X_train = []
        self.y_train = []
        self.X_test = []
        self.y_test = []
        self.y_pred = []
        self.vectorizer = None
        self.feature_names = None
        self.chi2 = None
        self.mlModel = None
        self.F = None

    def addModelDetails(self,model_p,name = ""):
        """Point this learner at a (new) model path and name."""
        self.name = name
        self.model_path = model_p

    def load_data(self,TrTe = 0): #TrTe => 0-Train 1-Test
        # returns the dimensions of vectors
        with open( self.data_path, 'rb') as f:
            if TrTe == 0:
                self.DesignMatrix = pickle.load(f)
                return len(self.DesignMatrix[1])
            if TrTe == 1:
                self.TestMatrix = pickle.load(f)
                return len(self.TestMatrix[1])

    def clearOld(self):
        """Reset all derived state, keeping the loaded raw matrices."""
        self.X_train = []
        self.y_train = []
        self.X_test = []
        self.y_test = []
        self.y_pred = []
        self.vectorizer = None
        self.feature_names = None
        self.chi2 = None
        self.mlModel = None

    def process(self,text,default = 0):
        """Normalize one text: simple lowercase/strip, or the project
        Filter when default != 0."""
        if default == 0:
            text = text.strip().lower().encode("utf-8")
        else:
            text = self.F.process(text)
        return text

    def loadXY(self,TrTe = 0,feature_index = 0,label_index = 1): #TrTe => 0-Train 1-Test
        # Split the loaded matrix rows into processed texts and labels.
        if TrTe == 0:
            for i in self.DesignMatrix:
                self.X_train.append(self.process(i[feature_index]))
                self.y_train.append(i[label_index])
            self.X_train = np.array(self.X_train)
            self.y_train = np.array(self.y_train)
        elif TrTe == 1:
            for i in self.TestMatrix:
                self.X_test.append(self.process(i[feature_index]))
                self.y_test.append(i[label_index])
            self.X_test = np.array(self.X_test)
            self.y_test = np.array(self.y_test)

    def featurizeXY(self,only_train = 1): # Extracts Features
        # Custom stop-word list (deliberately small; keeps negations etc.).
        sw = ['a', 'across', 'am', 'an', 'and', 'any', 'are', 'as', 'at',
              'be', 'been', 'being', 'but', 'by', 'can', 'could', 'did',
              'do', 'does', 'each', 'for', 'from', 'had', 'has', 'have',
              'in', 'into', 'is', "isn't", 'it', "it'd", "it'll", "it's",
              'its', 'of', 'on', 'or', 'that', "that's", 'thats', 'the',
              'there', "there's", 'theres', 'these', 'this', 'those', 'to',
              'under', 'until', 'up', 'were', 'will', 'with', 'would']
        self.vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words=sw)
        self.X_train = self.vectorizer.fit_transform(self.X_train)
        self.feature_names = self.vectorizer.get_feature_names()
        if only_train == 0:
            # Transform the test texts with the already-fitted vocabulary.
            self.X_test = self.vectorizer.transform(self.X_test)

    def reduceDimension(self,only_train = 1, percent = 50): # Reduce dimensions / self best of features
        n_samples, n_features = self.X_train.shape
        # NOTE(review): under Python 2, percent/100 is integer division, so
        # k == 0 for percent < 100 — confirm whether percent/100.0 was meant.
        k = int(n_features*(percent/100))
        self.chi2 = SelectKBest(chi2, k=k)
        self.X_train = self.chi2.fit_transform(self.X_train, self.y_train)
        # Keep feature names aligned with the selected columns.
        self.feature_names = [self.feature_names[i]
                              for i in self.chi2.get_support(indices=True)]
        self.feature_names = np.asarray(self.feature_names)
        if only_train == 0:
            self.X_test = self.chi2.transform(self.X_test)

    def trainModel(self,Model = "default"):
        """Fit the default LinearSVR, or any estimator passed in."""
        if Model == "default":
            self.mlModel = LinearSVR(loss='squared_epsilon_insensitive',dual=False, tol=1e-3)
        else:
            self.mlModel = Model
        self.mlModel.fit(self.X_train, self.y_train)

    def testModel(self,approx = 1): # returns score ONLY
        self.y_pred = np.array(self.mlModel.predict(self.X_test))
        if approx == 1: ### To convert real valued results to binary for scoring
            temp = []
            for y in self.y_pred:
                if y > 0.0:
                    temp.append(1.0)
                else:
                    temp.append(-1.0)
            self.y_pred = temp
        return metrics.accuracy_score(self.y_test, self.y_pred)

    def getReport(self,save = 1, get_top_words = 0): # returns report
        report = ""
        if get_top_words == 1:
            if hasattr(self.mlModel, 'coef_'):
                report += "Dimensionality: " + str(self.mlModel.coef_.shape[1])
                report += "\nDensity: " + str(density(self.mlModel.coef_))
                # Top/bottom 20 features by coefficient weight.
                rank = np.argsort(self.mlModel.coef_[0])
                top10 = rank[-20:]
                bottom10 = rank[:20]
                report += "\n\nTop 10 keywords: "
                report += "\nPositive: " + (" ".join(self.feature_names[top10]))
                report += "\nNegative: " + (" ".join(self.feature_names[bottom10]))
        score = metrics.accuracy_score(self.y_test, self.y_pred)
        report += "\n\nAccuracy: " + str(score)
        report += "\nClassification report: "
        report += "\n\n" + str(metrics.classification_report(self.y_test, self.y_pred,target_names=["Negative","Positive"]))
        report += "\nConfusion matrix: "
        report += "\n\n" + str(metrics.confusion_matrix(self.y_test, self.y_pred)) + "\n\n"
        if save == 1:
            with open(self.model_path + "report.txt", "w") as text_file:
                text_file.write(report)
        return report

    def crossVal(self,folds = 5, dim_red = 50,full_iter = 0, save = 1): # returns report
        # Caution: resets train and test X,y
        skf = cross_validation.StratifiedKFold(self.y_train, n_folds = folds,shuffle=True)
        print(skf)
        master_report = ""
        X_copy = self.X_train
        y_copy = self.y_train
        for train_index, test_index in skf:
            # Re-run the whole featurize/reduce/train/test pipeline per fold.
            self.X_train, self.X_test = X_copy[train_index], X_copy[test_index]
            self.y_train, self.y_test = y_copy[train_index], y_copy[test_index]
            self.featurizeXY(0)
            self.reduceDimension(0,dim_red)
            self.trainModel()
            self.testModel()
            master_report += self.getReport(save = 0,get_top_words = 0)
            # Unless full_iter is requested, evaluate only the first fold.
            if full_iter == 1:
                continue
            else:
                break
        if save == 1:
            with open(self.model_path + "master_report.txt", "w") as text_file:
                text_file.write(master_report)
        return master_report

    def save_obj(self,obj, name ):
        # protocol=2 keeps the pickles readable from Python 2.
        with open(self.model_path + name + '.pkl', 'wb') as f:
            pickle.dump(obj, f, protocol=2)

    def saveModel(self): # saves in model path
        self.save_obj(self.mlModel, self.name + "_model")
        self.save_obj(self.vectorizer, self.name + "_vectorizer")
        self.save_obj(self.chi2, self.name + "_feature_selector")

    def plot(self):
        ''' beta (Just plotting the model) (Not working) '''
        h = .02  # step size in the mesh
        # create a mesh to plot in
        x_min, x_max = self.X_train[:, 0].min() - 1, self.X_train[:, 0].max() + 1
        y_min, y_max = self.X_train[:, 1].min() - 1, self.X_train[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),np.arange(y_min, y_max, h))
        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, m_max]x[y_min, y_max].
        Z = self.mlModel.predict(np.c_[xx.ravel(), yy.ravel()])
        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        plt.contour(xx, yy, Z, cmap=plt.cm.Paired)
        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
        plt.xticks(())
        plt.yticks(())
        plt.title(self.name)
        plt.savefig(self.model_path + 'plot.png')
                                      coef0=1, C=5))))
poly_kernel_svm_clf.fit(X, y)
# RBF-kernel classifier on the same scaled features.
rbf_kernel_svm_clf = Pipeline(
    (("scaler", StandardScaler()),
     ("svm_clf", SVC(kernel="rbf", gamma=5, C=0.001))))
rbf_kernel_svm_clf.fit(X, y)
# (Translation of the note below:) LinearSVC is much faster than
# SVC(kernel="linear"), especially when the training set is very large or
# has many features. If the training set is not too large, try the Gaussian
# RBF kernel first; it works well in most cases.
"""
LinearSVC比SVC快得多(ker nel =“linear”)),特别是如果训练集非常大或者它有很多特征。
如果训练集不太大,则应该尝试高斯RBF内核;它在大多数情况下运作良好。
"""
if False:
    from sklearn.svm import LinearSVR
    # epsilon controls the street width; a large C means weak regularization.
    """
    epsilon -> street width C large regularization small
    """
    svm_reg = LinearSVR(epsilon=1.5)
    svm_reg.fit(X, y)
    # (Translation of the note below:) SVR is the regression equivalent of
    # SVC, and LinearSVR of LinearSVC. LinearSVR scales linearly with
    # training-set size (like LinearSVC), while SVR becomes too slow when
    # the training set grows (like SVC).
    """
    SVR类是SVC类的回归等价物,LinearSVR类是LinearSVC类的回归等价物。
    LinearSVR类与训练集的大小成线性关系(就像LinearSVC类一样),而当训练集变大时SVR类变得太慢(就像SVC类一样)
    """
    from sklearn.svm import SVR
    svm_poly_reg = SVR(kernel="poly", degree=2, C=100, epsilon=0.1)
    svm_poly_reg.fit(X, y)
# tuning: # increasing gamma makes the bell curves narrower, each instance less influencing - the decision boundary is wiggly # decreasing gamma broadens the bell shape, the decision boundary is smoother # if overfitting reduce gamma, if underitting, increase gamma # string kernel for text data ########################### SVM REGRESSION ######################################################### ## it tries to balance the opposite of SVM classifier ## it tries to fit as many instances as possible on the "street" ## while limiting margin violations # the width of the street is controlled by epsilon (margin) # epsilon-insensitive regression: adding more training instances within the margins doesn't influence the from sklearn.svm import LinearSVR svm_reg = LinearSVR(epsilon = 1.5) svm_reg.fit(x,y) # C can be used as a regulator hyperparameter, by decreasing C we apply more regularization # nonlinearity with polynomial from sklearn.svm import SVR svm_poly_reg = SVR(kernel = "poly", degree = 2, epsilon=0.1, C=100) svm_poly_reg.fit(x,y) ############## SVM in detail #################### # notation: # b - the bias term # w - the feature weights # 1. linear SVM CLF
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import SGDRegressor

# Importing the dataset
df = pd.read_csv("finalEncoded.csv")
y = df['price']
X = df.drop(columns=['price'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=1)

# Linear support-vector regressor; a small C applies strong regularization.
svr = LinearSVR(epsilon=0.01, C=0.01, fit_intercept=True)
svr.fit(X_train, y_train)

def svr_results(y_test, X_test, fitted_svr_model):
    """Print the hyper-parameters, fitted coefficients and error metrics
    of a fitted LinearSVR on the given test split."""
    print("C: {}".format(fitted_svr_model.C))
    print("Epsilon: {}".format(fitted_svr_model.epsilon))
    print("Intercept: {:,.3f}".format(fitted_svr_model.intercept_[0]))
    print("Coefficient: {:,.3f}".format(fitted_svr_model.coef_[0]))
    mae = mean_absolute_error(y_test, fitted_svr_model.predict(X_test))
    print("MAE = ${:,.2f}".format(1000*mae))
    # NOTE(review): this counts signed residuals below 5, not
    # |residual| < epsilon (epsilon is 0.01 above) — confirm the intended
    # definition before trusting the printed percentage.
    perc_within_eps = 100*np.sum(y_test - fitted_svr_model.predict(X_test) < 5) / len(y_test)
    print("Percentage within Epsilon = {:,.2f}%".format(perc_within_eps))
from sklearn.svm import LinearSVR # 导入线性回归类 from sklearn.datasets import load_boston # 导入加载波士顿数据集 from pandas import DataFrame # 导入DataFrame boston = load_boston() # 创建加载波士顿数据对象 # 将波士顿房价数据创建为DataFrame对象 df = DataFrame(boston.data, columns=boston.feature_names) df.insert(0, 'target', boston.target) # 将价格添加至DataFrame对象中 data_mean = df.mean() # 获取平均值 data_std = df.std() # 获取标准偏差 data_train = (df - data_mean) / data_std # 数据标准化 x_train = data_train[boston.feature_names].values # 特征数据 y_train = data_train['target'].values # 目标数据 linearsvr = LinearSVR(C=0.1) # 创建LinearSVR()对象 linearsvr.fit(x_train, y_train) # 训练模型 # 预测,并还原结果 x = ((df[boston.feature_names] - data_mean[boston.feature_names]) / data_std[boston.feature_names]).values # 添加预测房价的信息列 df[u'y_pred'] = linearsvr.predict(x) * data_std['target'] + data_mean['target'] print(df[['target', 'y_pred']].head()) #输出真实价格与预测价格
plt.title("SVC_RBF")
boundary(svc_rbf, [4, 8.5, 1.75, 4.75])
# Scatter each class with its own color on top of the decision boundary.
plt.scatter(X[y == 0, 0], X[y == 0, 1])
plt.scatter(X[y == 1, 0], X[y == 1, 1])
plt.scatter(X[y == 2, 0], X[y == 2, 1])
plt.axis([4, 8.5, 1.75, 4.75])

# Regression -----------------------------------------------------------
# Linear regression: noisy samples around the line y = 2x + 5.
x = np.linspace(0, 100, 100)
X = x.reshape(-1, 1)
y = 2 * x + 5 + np.random.uniform(-10, 10, 100)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
svr_line = LinearSVR(max_iter=1e5)
svr_line.fit(X_train, y_train)
y_predict = svr_line.predict(X_test)
print("svr_line r2_score =", r2_score(y_test, y_predict))
plt.sca(ax3)
plt.title("LinearSVR")
plt.scatter(X_train, y_train, c='b')
plt.plot(X_test, y_predict, c='r')

# Nonlinear regression: noisy samples around a quadratic curve.
x = np.linspace(-2, 2, 100)
X = x.reshape(-1, 1)
y = 0.5 * x ** 2 + 2 * x + 3 + np.random.normal(0, 0.5, 100)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
svr_rbf = SVR(kernel="rbf", gamma=0.5)
def linearSVR(XTrain, YTrain, XTest, YTest, **options):
    """Fit a LinearSVR and report relative squared errors on both splits.

    Args:
        XTrain, YTrain: training features/targets.
        XTest, YTest: held-out features/targets.
        **options: 'l' -> C regularization value for the SVR.

    Returns:
        A summary object carrying the fitted model, the train/test
        errors, and the data splits.
    """
    import numpy as np
    from sklearn.svm import LinearSVR
    # from sklearn import svm
    # linear_svr = [];
    # Create a classifier: a support vector classifier
    '''
    if options.get('l1'):
        l1 = options.get('l1');
        #print 'running l1 svm classification\r'
        linear_svr = svm.LinearSVR(C = l1, loss='squared_hinge', penalty='l1', dual=False)
    elif options.get('l2'):
        l2 = options.get('l2');
        #print 'running l2 svm classification\r'
        linear_svr = svm.LinearSVR(C = l2, loss='squared_hinge', penalty='l2', dual=True)
    '''
    l = options.get('l');
    linear_svr = LinearSVR(C=l, epsilon=0.0, dual = True, tol = 1e-9, fit_intercept = True)
    linear_svr.fit(XTrain, np.squeeze(YTrain))

    #%%
    # def perClassError_sr(Y,Yhat,eps0=10**-5):
    #     ce = np.mean(np.logical_and(abs(Y-Yhat) > eps0 , ~np.isnan(Yhat - Y)))*100
    #     return ce

    def perClassError_sr(y,yhat):
        # Squared error normalized by the squared norm of y — a relative
        # error; the 1e-10 guards against an all-zero target vector.
        err = np.linalg.norm(yhat - y)**2
        maxerr = np.linalg.norm(y+1e-10)**2
        # err = (np.linalg.norm(yhat - y)**2)/len(y)
        # maxerr = np.linalg.norm(y)**2
        # ce = err
        ce = err/ maxerr
        # ce = np.linalg.norm(yhat - y)**2 / len(y)
        return ce

    perClassErrorTest = perClassError_sr(YTest, linear_svr.predict(XTest));
    perClassErrorTrain = perClassError_sr(YTrain, linear_svr.predict(XTrain));

    #%%
    class summaryClass:
        # Plain record of the run; every field is overwritten below.
        perClassErrorTrain = [];
        perClassErrorTest = [];
        model = [];
        XTest = []
        XTrain = []
        YTest = []
        YTrain = []

    summary = summaryClass();
    summary.perClassErrorTrain = perClassErrorTrain;
    summary.perClassErrorTest = perClassErrorTest;
    summary.model = linear_svr;
    summary.XTest = XTest
    summary.XTrain = XTrain
    summary.YTest = YTest
    summary.YTrain = YTrain

    return summary
    # np.mean()
    # Per-fold training of a LinearSVR. This chunk is the body of an outer
    # cross-validation loop (the fold index i and the fold_ids / train /
    # test_X frames come from the enclosing scope). Python 2 code.
    print "----------- Fold %d -----------------------" %i
    print "--------------------------------------------"
    # Rows whose Id appears in this fold's column form the validation set.
    val_id = fold_ids.ix[:, i].dropna()
    idx = train["Id"].isin(list(val_id))
    trainingSet = train[~idx]
    validationSet = train[idx]
    tr_X = np.matrix(trainingSet[feature_names])
    tr_Y = np.array(trainingSet["Response"])
    val_X = np.matrix(validationSet[feature_names])
    val_Y = np.array(validationSet["Response"])
    regm = LinearSVR(C = 0.06, epsilon = 0.45, tol = 1e-5, dual = True, verbose = True, random_state = 133)
    regm.fit(tr_X, tr_Y)
    preds = regm.predict(val_X)
    # Collect out-of-fold predictions for later ensembling.
    df = pd.DataFrame(dict({"Id" : validationSet["Id"], "ground_truth" : validationSet["Response"], "linsvr_preds" : preds}))
    linsvr_val = linsvr_val.append(df, ignore_index = True)
    # Test-set predictions from this fold's model, one column per fold.
    tpreds = regm.predict(test_X)
    cname = "Fold" + `i`  # NOTE(review): backtick repr is Python-2-only
    linsvr_test[cname] = tpreds

# After all folds: persist the stacked validation and test predictions.
linsvr_val.to_csv("ensemble2/linsvr_val.csv")
linsvr_test.to_csv("ensemble2/linsvr_test.csv")
# Drop the basophil-percentage column from both splits.
train_data.drop(['嗜碱细胞%'], axis=1, inplace=True)
test_dataA.drop(['嗜碱细胞%'], axis=1, inplace=True)

# Fill missing values in every column with that column's mean.
for i in train_data.columns:
    train_data[i].fillna(train_data[i].mean(), inplace=True)
for i in test_dataA:
    test_dataA[i].fillna(test_dataA[i].mean(), inplace=True)

# Separate the blood-glucose target from the features.
train_data_y = train_data['血糖']
train_data.drop(['血糖'], axis=1, inplace=True)

# Standardize the features (fit on train, apply to test).
scaler = StandardScaler()
train_data = scaler.fit_transform(train_data.astype(float))
test_dataA = scaler.transform(test_dataA.astype(float))

# Build and fit the model.
lin_svr = LinearSVR(random_state=42, max_iter=5000)
lin_svr.fit(train_data, train_data_y)
test_features_labers = lin_svr.predict(test_dataA)

# Evaluate the model.
# NOTE(review): test_labels is not defined in this chunk — confirm it is
# created elsewhere before this runs.
mse = mean_squared_error(test_labels, test_features_labers)
print("平均偏离值: %f " % (mse))
print("均方误差:%f" % (np.sqrt(mse)))

# Randomized hyper-parameter search over an RBF SVR.
# NOTE(review): reciprocal([0.001, 0.1]) and the list of two uniform(1, 10)
# objects look unusual for scipy distributions — verify against the
# RandomizedSearchCV documentation.
param_distributions = {
    'gamma': reciprocal([0.001, 0.1]),
    'C': [uniform(1, 10), uniform(1, 10)]
}
rnd_search_cv = RandomizedSearchCV(SVR(), param_distributions, n_iter=10, verbose=2, cv=3,
# Build (feature, target) rows from the raw CSV: columns 0, 4, 6 and 10
# are the predictors, column 9 is the target; row 0 is the header.
data,target = [],[]
for row in csv.reader(data_file):
    data += [[row[0],row[4],row[6],row[10]]]
    target += [row[9]]
data,target = Lin_clean_data(data[1:],target[1:],2)

# First 2000 rows train the model; the next 20%-sized slice is the test set.
point = 2000
X_train = data[:point-1]
X_test = data[point:point+int(point*0.2)]
y_train = target[:point-1]
y_test = target[point:point+int(point*0.2)]

svr = LinearSVR(C=0.1)
svr_model = svr.fit(X_train,y_train)
lin = svr.predict(X_train)
lin_test = svr.predict(X_test)
# Rescale the predictions back to the target range before scoring.
lin,lin_test = data_normalize(y_train,y_test,lin,lin_test)
print("Train score : ",score(y_train,lin))
print("Train average error : ",sum(abs(y_train-lin)) / float(len(y_train)))
print("Fit score : ",score(y_test,lin_test))
print("Fit average error : ",sum(abs(y_test-lin_test)) / float(len(y_test)))

# Plot train/test predictions against ground truth and save the figure.
figure1 = plt.figure(1,figsize=[20,10])
draw_pic(range(len(X_train)),range(len(X_test)),lin,lin_test,y_train,y_test,label='lin',figure=figure1)
figure1.savefig("C:/Users/sean/Desktop/SVR_DATA/linSVR.png",dpi=300,format="png")
plt.close(1)
class AllRegressionModels:
    """
    Wrapper class around all supported regression models: LinearRegression,
    RandomForest, SVR, NuSVR, LinearSVR, and XGBRegressor.

    AllRegressionModels runs every available regression algorithm on the given
    dataset and outputs the coefficient of determination and execution time of
    each successful model when all_regression_models() is run. Any models that
    failed to run are listed afterwards.
    """

    def __init__(self, attributes=None, labels=None, test_size=0.25, verbose=False):
        """
        Initializes an AllRegressionModels object.

        Parameters:
        - attributes: numpy array of the desired independent variables
          (default None)
        - labels: numpy array of the desired dependent variables (default None)
        - test_size: proportion of the dataset used for testing; the training
          proportion is the complement of test_size (default 0.25)
        - verbose: whether or not to output any and all logging during model
          training (default False)

        Note: These are the only parameters allowed. All other parameters for
        each model use their default values. For more granular control, please
        instantiate each model individually.
        """
        self.attributes = attributes
        self.labels = labels
        self.test_size = test_size
        self.verbose = verbose

        # All models are constructed up front but remain unfitted until
        # all_regression_models() is called.
        self.linear_regression = LinearRegression()
        self.random_forest = RandomForestRegressor(verbose=self.verbose)
        self.SVR = SVR(verbose=self.verbose)
        self.nu_SVR = NuSVR(verbose=self.verbose)
        self.linear_SVR = LinearSVR(verbose=self.verbose)
        self.XGB_regressor = XGBRegressor(verbosity=int(self.verbose))

        # Results table: model name -> [R2 score, execution time]; the first
        # entry doubles as the table header.
        self._regression_models = {"Model": ["R2 Score", "Time"]}
        self._failures = []

    # Accessor methods

    def get_attributes(self):
        """
        Accessor method for attributes.

        attributes is None until set via __init__ or set_attributes(); it must
        be a populated numpy array of independent variables before
        all_regression_models() is called.
        """
        return self.attributes

    def get_labels(self):
        """
        Accessor method for labels.

        labels is None until set via __init__ or set_labels(); it must be a
        populated numpy array of dependent variables before
        all_regression_models() is called.
        """
        return self.labels

    def get_test_size(self):
        """Accessor method for test_size. Returns a number or None."""
        return self.test_size

    def get_verbose(self):
        """Accessor method for verbose. Defaults to False if not set."""
        return self.verbose

    def get_all_regression_models(self):
        """
        Accessor method that returns a list of all model instances.

        The models are unfitted until all_regression_models() has been called.
        """
        return [self.linear_regression, self.random_forest, self.SVR,
                self.nu_SVR, self.linear_SVR, self.XGB_regressor]

    def get_linear_regression(self):
        """Accessor for linear_regression (unfitted until all_regression_models() runs)."""
        return self.linear_regression

    def get_random_forest(self):
        """Accessor for random_forest (unfitted until all_regression_models() runs)."""
        return self.random_forest

    def get_SVR(self):
        """Accessor for SVR (unfitted until all_regression_models() runs)."""
        return self.SVR

    def get_nu_SVR(self):
        """Accessor for nu_SVR (unfitted until all_regression_models() runs)."""
        return self.nu_SVR

    def get_linear_SVR(self):
        """Accessor for linear_SVR (unfitted until all_regression_models() runs)."""
        return self.linear_SVR

    def get_XGB_regressor(self):
        """Accessor for XGB_regressor (unfitted until all_regression_models() runs)."""
        return self.XGB_regressor

    # Modifier methods

    def set_attributes(self, new_attributes=None):
        """
        Modifier method for attributes.

        Input should be a numpy array of independent variables. Defaults to None.
        """
        self.attributes = new_attributes

    def set_labels(self, new_labels=None):
        """
        Modifier method for labels.

        Input should be a numpy array of dependent variables. Defaults to None.
        """
        self.labels = new_labels

    def set_test_size(self, new_test_size=0.25):
        """
        Modifier method for test_size.

        Input should be a number or None. Defaults to 0.25.
        """
        self.test_size = new_test_size

    def set_verbose(self, new_verbose=False):
        """
        Modifier method for verbose.

        Input should be a truthy/falsy value. Defaults to False.
        """
        self.verbose = new_verbose

    # Regression functionality

    def all_regression_models(self):
        """
        Driver method for running all regression models with the given
        attributes and labels.

        Trains the models and measures their coefficients of determination and
        execution times via _all_regression_models_runner(), then calls
        _print_results() to print each successful model's measurements and
        list any failed models. If verbose is True, all per-model logging is
        enabled; if False, all stdout/stderr output is suppressed.
        """
        if not self.verbose:
            suppress_output = io.StringIO()
            with redirect_stderr(suppress_output), redirect_stdout(suppress_output):
                self._all_regression_models_runner()
        else:
            self._all_regression_models_runner()

        self._print_results()

    # Helper methods

    def _all_regression_models_runner(self):
        """
        Helper that fits and scores every model on a fresh train/test split,
        recording [R2 score, execution time] for successes and the model name
        for failures.

        May only be called by all_regression_models().
        """
        dataset_X_train, dataset_X_test, dataset_y_train, dataset_y_test =\
            train_test_split(self.attributes, self.labels, test_size=self.test_size)

        # One (name, model) pair per supported algorithm; each is timed and
        # scored by the same helper instead of six copy-pasted try blocks.
        candidates = [
            ("LinearRegression", self.linear_regression),
            ("RandomForest", self.random_forest),
            ("SVR", self.SVR),
            ("NuSVR", self.nu_SVR),
            ("LinearSVR", self.linear_SVR),
            ("XGBRegressor", self.XGB_regressor),
        ]
        for name, model in candidates:
            self._fit_and_record(name, model, dataset_X_train, dataset_y_train,
                                 dataset_X_test, dataset_y_test)

    def _fit_and_record(self, name, model, X_train, y_train, X_test, y_test):
        """Fit one model (timing the fit) and record its R2 score, or record
        the model name as a failure if fitting/scoring raises."""
        try:
            start_time = time.time()
            model.fit(X_train, y_train)
            end_time = time.time()
            self._regression_models[name] =\
                [model.score(X_test, y_test), end_time - start_time]
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are not silently swallowed.
        except Exception:
            self._failures.append(name)

    def _print_results(self):
        """
        Helper that prints the results of _all_regression_models_runner() in
        tabular form, followed by any models that failed to run.

        May only be called by all_regression_models() after all models have
        attempted to run.
        """
        print("\nResults:\n")
        for model, data in self._regression_models.items():
            print("{:<20} {:<20} {:<20}".format(model, data[0], data[1]))
        print()

        if len(self._failures) > 0:
            print("The following models failed to run:\n")
            for entry in self._failures:
                print(entry)
            print()
# Estimate per-sample composition by regressing each sample column of the
# mixture matrix on the reference profiles with a linear SVR, then plot the
# normalized coefficients.
from sklearn.svm import LinearSVR
from sklearn.datasets import make_regression
import matplotlib.pyplot as plt
import random
from matplotlib.pyplot import figure

test_data = pd.read_csv("Sample.csv")
reference = pd.read_csv("Reference.csv")

# Drop the label columns.
# FIX: the positional `axis` argument to DataFrame.drop (e.g. drop("x", 1))
# was deprecated and removed in pandas 2.0 -- name it explicitly.
train = reference.drop("MF_name", axis=1)
test_data = test_data.drop("gene", axis=1)

score_adj = []
for o in range(len(test_data.columns)):
    test = test_data.loc[:, test_data.columns[o]]
    im_name = train.columns
    svr = LinearSVR(random_state=0)
    model = svr.fit(train, test)
    score = model.coef_
    # Clamp negative coefficients to zero, then normalise to proportions.
    score[np.where(score < 0)] = 0
    score_adj.append((score / sum(score)))

score_adj = pd.DataFrame(score_adj)
score_adj.columns = im_name

# Stacked bar chart of the estimated proportions per sample.
score_adj.plot(kind='bar', stacked=True, legend=False)
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.suptitle("Flow&Estimate")
plt.rcParams['figure.figsize'] = (6.69, 8.86)
plt.rcParams['figure.dpi'] = 300
name = "bar.pdf"
plt.savefig(name, bbox_inches="tight")
plt.close()

# Box plot of the same proportions, one box per reference column.
plt.boxplot(score_adj.T, patch_artist=True)
plt.suptitle("Flow&Estimate")
def train_linear(x, y):
    """Fit a LinearSVR (C=1, tol=1e-5, max_iter=1000) on (x, y) and return the
    fitted model."""
    clf = LinearSVR(C=1, tol=1e-5, max_iter=1000)
    # fit() returns the estimator itself, so this is the fitted model.
    return clf.fit(x, y)
print('')
# ---- LinearSVC: fit, then report train/test accuracy ----
lsvc = LinearSVC()
print('LinearSVC config:')
print(lsvc.get_params())
lsvc.fit(smr_train.feature_matrix, smr_train.labels)
lsvc_score_train = lsvc.score(smr_train.feature_matrix, smr_train.labels)
print('LinearSVC precision train: {}'.format(lsvc_score_train))
lsvc_score_test = lsvc.score(smr_test.feature_matrix, smr_test.labels)
print('LinearSVC precision test: {}'.format(lsvc_score_test))
print('')

# ---- LinearSVR: fit, then report train/test R2 ----
lsvr = LinearSVR()
print('LinearSVR config:')
# FIX: the original printed `svc.get_params()` and scored with `svc.score(...)`
# (a different model defined elsewhere) instead of the freshly created `lsvr`.
print(lsvr.get_params())
lsvr.fit(smr_train.feature_matrix, smr_train.labels)
lsvr_score_train = lsvr.score(smr_train.feature_matrix, smr_train.labels)
print('LinearSVR precision train: {}'.format(lsvr_score_train))
lsvr_score_test = lsvr.score(smr_test.feature_matrix, smr_test.labels)
print('LinearSVR precision test: {}'.format(lsvr_score_test))
print('')

# ---- NuSVC: fit, then report train/test accuracy ----
nusvc = NuSVC()
print('NuSVC config:')
print(nusvc.get_params())
nusvc.fit(smr_train.feature_matrix, smr_train.labels)
nusvc_score_train = nusvc.score(smr_train.feature_matrix, smr_train.labels)
print('NuSVC precision train: {}'.format(nusvc_score_train))
nusvc_score_test = nusvc.score(smr_test.feature_matrix, smr_test.labels)
print('NuSVC precision test: {}'.format(nusvc_score_test))
print('')
# Fit a linear SVR on the engineered features and write a Kaggle-style
# submission. (Models tried previously and abandoned: per-column mean
# inspection, linear_model.Lasso(alpha=0.1), RandomForestRegressor, and
# AdaBoostRegressor(n_estimators=80).)
regr = LinearSVR(C=1.0, epsilon=0.2)
regr.fit(data_new[features], y)

predictions = regr.predict(datatest)
print('predictions')
print(predictions)

# Presumably the target was log-transformed for training, hence the exp()
# back-transform before writing the submission -- confirm upstream.
datatest_result = pd.read_csv('test.csv', header=0)
datatest_result['loss'] = np.exp(predictions)
header = ["id", "loss"]
datatest_result.to_csv("Results_AllState_SVR_81.csv", sep=',',
                       columns=header, index=False)

# Quick inspection of the distinct levels in every feature column.
for col in data.columns[:-1]:
    print(data[col].unique())
## KneighborsRegressor from sklearn.neighbors import KNeighborsRegressor knreg = KNeighborsRegressor(n_neighbors=5) knreg.fit(X_train, y_train) score_list.append(knreg.score(X_test, y_test)) ## Support Vector Regressor from sklearn.svm import SVR svm_reg = SVR(kernel='poly', gamma='auto', degree=2, C=5, epsilon=0.1) svm_reg.fit(X_train, y_train) score_list.append(svm_reg.score(X_test, y_test)) ## linearSVR from sklearn.svm import LinearSVR sv_reg = LinearSVR(max_iter=1000) sv_reg.fit(X_train, y_train) score_list.append(sv_reg.score(X_test, y_test)) ## random forest from sklearn.ensemble import RandomForestRegressor rf_reg = RandomForestRegressor(max_depth=5) rf_reg.fit(X_train, y_train) score_list.append(rf_reg.score(X_test, y_test)) ''' ## LightGBM import lightgbm as lgb lgb_reg=lgb.LGBMRegressor(objective='regression') lgb_reg.fit(X_train, y_train) score_list.append(lgb_reg.score(X_test, y_test)) ''' '''
def default_datasets(carrier, id_airport): # # **Predicting flight delays** # In this notebook, we developed the model aimed at predicting flight delays at take-off. # During the EDA, we intended to create good quality figures # This notebook is composed of three parts: # Cleaning # * Date and Times # * Missing Values # Exploration # * Graphs # * Impact of Departure Vs Arrival Delays # Modeling # The model is developed for one airport and one airline # * Linear # * Ridge # * Random Forest # * Neural Networks # * SVM # In[2]: import datetime, warnings, scipy import pandas as pd import numpy as np import seaborn as sns import matplotlib as mpl import matplotlib.pyplot as plt import matplotlib.patches as patches from matplotlib.patches import ConnectionPatch from collections import OrderedDict from matplotlib.gridspec import GridSpec from sklearn import metrics, linear_model from sklearn.preprocessing import PolynomialFeatures, StandardScaler from sklearn.preprocessing import LabelEncoder, OneHotEncoder from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict from scipy.optimize import curve_fit from sklearn.metrics import r2_score from random import sample import matplotlib.patches as mpatches from sklearn.linear_model import Ridge from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import r2_score from scipy.stats import spearmanr, pearsonr from sklearn.svm import SVR plt.rcParams["patch.force_edgecolor"] = True plt.style.use('fivethirtyeight') mpl.rc('patch', edgecolor='dimgray', linewidth=1) from IPython.core.interactiveshell import InteractiveShell InteractiveShell.ast_node_interactivity = "last_expr" pd.options.display.max_columns = 50 #get_ipython().magic('matplotlib inline') warnings.filterwarnings("ignore") # In[2]: df = pd.read_csv( '/Users/sarveshprattipati/Downloads/flight-delays/flights.csv', low_memory=False) print('Dataframe dimensions:', df.shape) airports = pd.read_csv( 
"/Users/sarveshprattipati/Downloads/flight-delays/airports.csv") airlines_names = pd.read_csv( '/Users/sarveshprattipati/Downloads/flight-delays/airlines.csv') airlines_names abbr_companies = airlines_names.set_index('IATA_CODE')['AIRLINE'].to_dict() carrier = 'AA' id_airport = 'DFW' # %% # # 1. Cleaning # # 1.1 Dates and times # # **YEAR, MONTH, DAY**, is merged into date column df['DATE'] = pd.to_datetime(df[['YEAR', 'MONTH', 'DAY']]) # Moreover, in the **SCHEDULED_DEPARTURE** variable, the hour of the take-off is coded as a float where the two first digits indicate the hour and the two last, the minutes. This format is not convenient and I thus convert it. Finally, I merge the take-off hour with the flight date. To proceed with these transformations, I define a few functions: # Function that converts the 'HHMM' string to datetime.time def format_heure(chaine): if pd.isnull(chaine): return np.nan else: if chaine == 2400: chaine = 0 chaine = "{0:04d}".format(int(chaine)) heure = datetime.time(int(chaine[0:2]), int(chaine[2:4])) return heure # Function that combines a date and time to produce a datetime.datetime def combine_date_heure(x): if pd.isnull(x[0]) or pd.isnull(x[1]): return np.nan else: return datetime.datetime.combine(x[0], x[1]) # Function that combine two columns of the dataframe to create a datetime format def create_flight_time(df, col): liste = [] for index, cols in df[['DATE', col]].iterrows(): if pd.isnull(cols[1]): liste.append(np.nan) elif float(cols[1]) == 2400: cols[0] += datetime.timedelta(days=1) cols[1] = datetime.time(0, 0) liste.append(combine_date_heure(cols)) else: cols[1] = format_heure(cols[1]) liste.append(combine_date_heure(cols)) return pd.Series(liste) df['SCHEDULED_DEPARTURE'] = create_flight_time(df, 'SCHEDULED_DEPARTURE') df['DEPARTURE_TIME'] = df['DEPARTURE_TIME'].apply(format_heure) df['SCHEDULED_ARRIVAL'] = df['SCHEDULED_ARRIVAL'].apply(format_heure) df['ARRIVAL_TIME'] = df['ARRIVAL_TIME'].apply(format_heure) # 
__________________________________________________________________________ # df.loc[:5, ['SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL', 'DEPARTURE_TIME', # 'ARRIVAL_TIME', 'DEPARTURE_DELAY', 'ARRIVAL_DELAY']] # The content of the **DEPARTURE_TIME** and **ARRIVAL_TIME** variables can be a bit misleading. # the first entry of the dataframe, the scheduled departure is at 0h05 the 1st of January. # ### 1.2 Filling factor # # Finally, the data frame is cleaned and few columns are dropped variables_to_remove = [ 'TAXI_OUT', 'TAXI_IN', 'WHEELS_ON', 'WHEELS_OFF', 'YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'DATE', 'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON', 'FLIGHT_NUMBER', 'TAIL_NUMBER', 'AIR_TIME' ] df.drop(variables_to_remove, axis=1, inplace=True) df = df[[ 'AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY', 'SCHEDULED_TIME', 'ELAPSED_TIME' ]] # df[:5] missing_df = df.isnull().sum(axis=0).reset_index() missing_df.columns = ['variable', 'missing values'] missing_df['filling factor (%)'] = ( df.shape[0] - missing_df['missing values']) / df.shape[0] * 100 missing_df.sort_values('filling factor (%)').reset_index(drop=True) # The filling factor is quite good (> 97%). So dropping the rows with NA is a good option df.dropna(inplace=True) # %% # # 2. 
Exploration # # 2.1 Basic statistical description of airlines # function for statistical parameters from a grouby object: def get_stats(group): return { 'min': group.min(), 'max': group.max(), 'count': group.count(), 'mean': group.mean() } global_stats = df['DEPARTURE_DELAY'].groupby( df['AIRLINE']).apply(get_stats).unstack() global_stats = global_stats.sort_values('count') global_stats # In[15]: # # 2.1 Graphs # Pie chart for font = {'family': 'normal', 'weight': 'bold', 'size': 15} mpl.rc('font', **font) # __________________________________________________________________ # I extract a subset of columns and redefine the airlines labeling df2 = df.loc[:, ['AIRLINE', 'DEPARTURE_DELAY']] df2['AIRLINE'] = df2['AIRLINE'].replace(abbr_companies) # ________________________________________________________________________ colors = [ 'royalblue', 'grey', 'wheat', 'c', 'firebrick', 'seagreen', 'lightskyblue', 'lightcoral', 'yellowgreen', 'gold', 'tomato', 'violet', 'aquamarine', 'chartreuse' ] # ___________________________________ fig = plt.figure(1, figsize=(16, 15)) gs = GridSpec(2, 1) ax1 = fig.add_subplot(gs[0, 0]) ax2 = fig.add_subplot(gs[1, 0]) labels = [s for s in global_stats.index] # ---------------------------------------- # Pie chart for mean delay at departure # ---------------------------------------- sizes = global_stats['mean'].values sizes = [max(s, 0) for s in sizes] explode = [ 0.0 if sizes[i] < 20000 else 0.01 for i in range(len(abbr_companies)) ] patches, texts, autotexts = ax1.pie( sizes, explode=explode, labels=labels, colors=colors, shadow=False, startangle=0, autopct=lambda p: '{:.0f}'.format(p * sum(sizes) / 100)) for i in range(len(abbr_companies)): texts[i].set_fontsize(14) ax1.axis('equal') ax1.set_title('Mean delay at origin', bbox={ 'facecolor': 'midnightblue', 'pad': 5 }, color='w', fontsize=18) # ------------------------------------------------------ # striplot with all the values for the delays # 
___________________________________________________________________ # Defining the colors for correspondance with the pie charts colors = [ 'firebrick', 'gold', 'lightcoral', 'aquamarine', 'c', 'yellowgreen', 'grey', 'seagreen', 'tomato', 'violet', 'wheat', 'chartreuse', 'lightskyblue', 'royalblue' ] # ___________________________________________________________________ ax2 = sns.stripplot(y="AIRLINE", x="DEPARTURE_DELAY", size=4, palette=colors, data=df2, linewidth=0.5, jitter=True) plt.setp(ax2.get_xticklabels(), fontsize=14) plt.setp(ax2.get_yticklabels(), fontsize=14) ax2.set_xticklabels([ '{:2.0f}h{:2.0f}m'.format(*[int(y) for y in divmod(x, 60)]) for x in ax2.get_xticks() ]) plt.xlabel('Departure delay', fontsize=18, bbox={ 'facecolor': 'midnightblue', 'pad': 5 }, color='w', labelpad=20) ax2.yaxis.label.set_visible(False) # ________________________ plt.tight_layout(w_pad=3) # If we Exclude Hawaiian Airlines and Alaska Airlines, which have low mean delays, the mean delay would be 11 ± 7 minutes # The second graph shows that, incase of mean delay being 11 minutes, there might be hours delay for some flights # In[16]: # # 2.1 Graphs # Function defining how delays are grouped delay_type = lambda x: ((0, 1)[x > 5], 2)[x > 45] df['DELAY_LEVEL'] = df['DEPARTURE_DELAY'].apply(delay_type) fig = plt.figure(1, figsize=(10, 7)) ax = sns.countplot(y="AIRLINE", hue='DELAY_LEVEL', data=df) # We replace the abbreviations by the full names of the companies and set the labels labels = [abbr_companies[item.get_text()] for item in ax.get_yticklabels()] ax.set_yticklabels(labels) plt.setp(ax.get_xticklabels(), fontsize=12, weight='normal', rotation=0) plt.setp(ax.get_yticklabels(), fontsize=12, weight='bold', rotation=0) ax.yaxis.label.set_visible(False) plt.xlabel('Flight count', fontsize=16, weight='bold', labelpad=10) # Set the legend L = plt.legend() L.get_texts()[0].set_text('on time (t < 5 min)') L.get_texts()[1].set_text('small delay (5 < t < 45 min)') 
L.get_texts()[2].set_text('large delay (t > 45 min)') plt.show() # %% # # 2.2 Impact of Departure Vs Arrival Delays mpl.rcParams.update(mpl.rcParamsDefault) mpl.rcParams['hatch.linewidth'] = 2.0 fig = plt.figure(1, figsize=(11, 6)) ax = sns.barplot(x="DEPARTURE_DELAY", y="AIRLINE", data=df, color="lightskyblue", ci=None) ax = sns.barplot(x="ARRIVAL_DELAY", y="AIRLINE", data=df, color="r", hatch='///', alpha=0.0, ci=None) labels = [abbr_companies[item.get_text()] for item in ax.get_yticklabels()] ax.set_yticklabels(labels) ax.yaxis.label.set_visible(False) plt.xlabel('Mean delay [min] (@departure: blue, @arrival: hatch lines)', fontsize=14, weight='bold', labelpad=10) # This figure shows arrival delays are lower than departure delays. # The arrival delays can be compensated during air travel. # So for this project we have estimating the departure delays. # %% # ### 2.2 Vizualization for delays at origin airports airport_mean_delays = pd.DataFrame(pd.Series( df['ORIGIN_AIRPORT'].unique())) airport_mean_delays.set_index(0, drop=True, inplace=True) for carrier in abbr_companies.keys(): df1 = df[df['AIRLINE'] == carrier] test = df1['DEPARTURE_DELAY'].groupby( df['ORIGIN_AIRPORT']).apply(get_stats).unstack() airport_mean_delays[carrier] = test.loc[:, 'mean'] temp_airports = airports identify_airport = temp_airports.set_index('IATA_CODE')['CITY'].to_dict() sns.set(context="paper") fig = plt.figure(1, figsize=(8, 8)) ax = fig.add_subplot(1, 2, 1) subset = airport_mean_delays.iloc[:50, :].rename(columns=abbr_companies) subset = subset.rename(index=identify_airport) mask = subset.isnull() sns.heatmap(subset, linewidths=0.01, cmap="Accent", mask=mask, vmin=0, vmax=35) plt.setp(ax.get_xticklabels(), fontsize=10, rotation=85) ax.yaxis.label.set_visible(False) ax = fig.add_subplot(1, 2, 2) subset = airport_mean_delays.iloc[50:100, :].rename(columns=abbr_companies) subset = subset.rename(index=identify_airport) fig.text(0.5, 1.02, "Delays: impact of the origin airport", 
ha='center', fontsize=18) mask = subset.isnull() sns.heatmap(subset, linewidths=0.01, cmap="Accent", mask=mask, vmin=0, vmax=35) plt.setp(ax.get_xticklabels(), fontsize=10, rotation=85) ax.yaxis.label.set_visible(False) plt.tight_layout() # From the above graph, we deduce # American eagle has large delays # Delta airlines has delays less than 5 minutes # Few airports favour late departure,like Denver, Chicago # In[32]: # Common class for graphs class Figure_style(): # _________________________________________________________________ def __init__(self, size_x=11, size_y=5, nrows=1, ncols=1): sns.set_style("white") sns.set_context("notebook", font_scale=1.2, rc={"lines.linewidth": 2.5}) self.fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=( size_x, size_y, )) # ________________________________ # convert self.axs to 2D array if nrows == 1 and ncols == 1: self.axs = np.reshape(axs, (1, -1)) elif nrows == 1: self.axs = np.reshape(axs, (1, -1)) elif ncols == 1: self.axs = np.reshape(axs, (-1, 1)) # _____________________________ def pos_update(self, ix, iy): self.ix, self.iy = ix, iy # _______________ def style(self): self.axs[self.ix, self.iy].spines['right'].set_visible(False) self.axs[self.ix, self.iy].spines['top'].set_visible(False) self.axs[self.ix, self.iy].yaxis.grid(color='lightgray', linestyle=':') self.axs[self.ix, self.iy].xaxis.grid(color='lightgray', linestyle=':') self.axs[self.ix, self.iy].tick_params(axis='both', which='major', labelsize=10, size=5) # ________________________________________ def draw_legend(self, location='upper right'): legend = self.axs[self.ix, self.iy].legend(loc=location, shadow=True, facecolor='g', frameon=True) legend.get_frame().set_facecolor('whitesmoke') # _________________________________________________________________________________ def cust_plot(self, x, y, color='b', linestyle='-', linewidth=1, marker=None, label=''): if marker: markerfacecolor, marker, markersize = marker[:] self.axs[self.ix, self.iy].plot(x, y, 
color=color, linestyle=linestyle, linewidth=linewidth, marker=marker, label=label, markerfacecolor=markerfacecolor, markersize=markersize) else: self.axs[self.ix, self.iy].plot(x, y, color=color, linestyle=linestyle, linewidth=linewidth, label=label) self.fig.autofmt_xdate() # ________________________________________________________________________ def cust_plot_date(self, x, y, color='lightblue', linestyle='-', linewidth=1, markeredge=False, label=''): markeredgewidth = 1 if markeredge else 0 self.axs[self.ix, self.iy].plot_date(x, y, color='lightblue', markeredgecolor='grey', markeredgewidth=markeredgewidth, label=label) # ________________________________________________________________________ def cust_scatter(self, x, y, color='lightblue', markeredge=False, label=''): markeredgewidth = 1 if markeredge else 0 self.axs[self.ix, self.iy].scatter(x, y, color=color, edgecolor='grey', linewidths=markeredgewidth, label=label) # def set_xlabel(self, label, fontsize=14): self.axs[self.ix, self.iy].set_xlabel(label, fontsize=fontsize) def set_ylabel(self, label, fontsize=14): self.axs[self.ix, self.iy].set_ylabel(label, fontsize=fontsize) # ____________________________________ def set_xlim(self, lim_inf, lim_sup): self.axs[self.ix, self.iy].set_xlim([lim_inf, lim_sup]) # ____________________________________ def set_ylim(self, lim_inf, lim_sup): self.axs[self.ix, self.iy].set_ylim([lim_inf, lim_sup]) # Sampling the data with 80:20 training and test data set df_train = df.sample(frac=0.8) df_test = df.loc[~df.index.isin(df_train.index)] df = df_train # In[37]: # Defining dataframe creation function ########################################################################### def get_flight_delays(df, carrier, id_airport, extrem_values=False): df2 = df[(df['AIRLINE'] == carrier) & (df['ORIGIN_AIRPORT'] == id_airport)] # _______________________________________ # remove extreme values before fitting if extrem_values: df2['DEPARTURE_DELAY'] = df2['DEPARTURE_DELAY'].apply( lambda 
x: x if x < 60 else np.nan) df2.dropna(how='any') # __________________________________ df2.sort_values('SCHEDULED_DEPARTURE', inplace=True) df2['schedule_depart'] = df2['SCHEDULED_DEPARTURE'].apply( lambda x: x.time()) # ___________________________________________________________________ test2 = df2['DEPARTURE_DELAY'].groupby( df2['schedule_depart']).apply(get_stats).unstack() test2.reset_index(inplace=True) # ___________________________________ fct = lambda x: x.hour * 60 + x.minute test2.reset_index(inplace=True) test2['schedule_depart_mnts'] = test2['schedule_depart'].apply(fct) return test2 def create_df(df, carrier, id_airport, extrem_values=False): df2 = df[(df['AIRLINE'] == carrier) & (df['ORIGIN_AIRPORT'] == id_airport)] df2.dropna(how='any', inplace=True) df2['weekday'] = df2['SCHEDULED_DEPARTURE'].apply( lambda x: x.weekday()) # ____________________ # delete delays > 1h df2['DEPARTURE_DELAY'] = df2['DEPARTURE_DELAY'].apply( lambda x: x if x < 60 else np.nan) df2.dropna(how='any', inplace=True) # _________________ # formating times fct = lambda x: x.hour * 60 + x.minute df2['schedule_depart'] = df2['SCHEDULED_DEPARTURE'].apply( lambda x: x.time()) df2['schedule_depart_mnts'] = df2['schedule_depart'].apply(fct) df2['schedule_arrivee'] = df2['SCHEDULED_ARRIVAL'].apply(fct) df3 = df2.groupby(['schedule_depart_mnts', 'schedule_arrivee'], as_index=False).mean() return df3 # # In[39]: # Linear Regression ####### Linear_Train ####### test2 = get_flight_delays(df, carrier, id_airport, False) test2.to_csv('Model_dataset.csv', sep=',') test = test2[['mean', 'schedule_depart_mnts']].dropna(how='any', axis=0) X_L_train = np.array(test['schedule_depart_mnts']) Y_L_train = np.array(test['mean']) X_L_train = X_L_train.reshape(len(X_L_train), 1) Y_L_train = Y_L_train.reshape(len(Y_L_train), 1) regr = linear_model.LinearRegression() regr.fit(X_L_train, Y_L_train) result_L_train = regr.predict(X_L_train) score_L_train = regr.score(X_L_train, Y_L_train) # print("R^2 for 
Linear Train= ",score_L_train) print("MSE Linear Train=", metrics.mean_squared_error(result_L_train, Y_L_train)) # The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares # ((y_true - y_pred) ** 2).sum() and v is the # total sum of squares ((y_true - y_true.mean()) ** 2).sum(). ####### Linear_Test ####### test2 = get_flight_delays(df_test, carrier, id_airport, False) test = test2[['mean', 'schedule_depart_mnts']].dropna(how='any', axis=0) X_L_test = np.array(test['schedule_depart_mnts']) Y_L_test = np.array(test['mean']) X_L_test = X_L_test.reshape(len(X_L_test), 1) Y_L_test = Y_L_test.reshape(len(Y_L_test), 1) result_L_test = regr.predict(X_L_test) score_L_test = regr.score(X_L_test, Y_L_test) # print("R^2 for Linear Test= ",score_L_test) print("MSE Linear Test=", metrics.mean_squared_error(result_L_test, Y_L_test)) fig1 = Figure_style(8, 4, 1, 1) fig1.pos_update(0, 0) # fig1.cust_scatter(df1['heure_depart'], df1['DEPARTURE_DELAY'], markeredge = True) fig1.cust_plot(X_L_test, Y_L_test, color='b', linestyle=':', linewidth=2, marker=('b', 's', 10)) fig1.cust_plot(X_L_test, result_L_test, color='g', linewidth=3) fig1.style() fig1.set_ylabel('Delay (minutes)', fontsize=14) fig1.set_xlabel('Departure time', fontsize=14) # ____________________________________ # convert and set the x ticks labels fct_convert = lambda x: (int(x / 3600), int(divmod(x, 3600)[1] / 60)) fig1.axs[fig1.ix, fig1.iy].set_xticklabels([ '{:2.0f}h{:2.0f}m'.format(*fct_convert(x)) for x in fig1.axs[fig1.ix, fig1.iy].get_xticks() ]) # In[77]: # Ridge Regression ####### Ridge_Training ####### df3 = get_flight_delays(df, carrier, id_airport) df3[:5] # df1 = df[(df['AIRLINE'] == carrier) & (df['ORIGIN_AIRPORT'] == id_airport)] # df1['heure_depart'] = df1['SCHEDULED_DEPARTURE'].apply(lambda x:x.time()) # df1['heure_depart'] = df1['heure_depart'].apply(lambda x:x.hour*60+x.minute) df3 = df3[['mean', 'schedule_depart_mnts']].dropna(how='any', axis=0) X = 
np.array(df3['schedule_depart_mnts']) Y = np.array(df3['mean']) X = X.reshape(len(X), 1) Y = Y.reshape(len(Y), 1) parameters = [0.2, 1] ridgereg = Ridge(alpha=parameters[0], normalize=True) poly = PolynomialFeatures(degree=parameters[1]) X_ = poly.fit_transform(X) ridgereg.fit(X_, Y) result_R_train = ridgereg.predict(X_) score_R_train = metrics.mean_squared_error(result_R_train, Y) r2_R_train = regr.score(X, Y) # print("R^2 for Ridge Train:",r2_R_train ) print('MSE Ridge Train= {}'.format(round(score_R_train, 2))) ####### Ridge_Test ####### df3 = get_flight_delays(df_test, carrier, id_airport) df3[:5] test = df3[['mean', 'schedule_depart_mnts']].dropna(how='any', axis=0) X_L_test = np.array(test['schedule_depart_mnts']) Y_L_test = np.array(test['mean']) X_testt = X.reshape(len(X), 1) Y_testt = Y.reshape(len(Y), 1) X_ = poly.fit_transform(X_testt) result_test = ridgereg.predict(X_) score_R_test = metrics.mean_squared_error(result_test, Y_testt) r2_ridge_test = r2_score(X_testt, Y_testt) # print("R^2 for Ridge Test is: ",r2_ridge_test ) print('MSE Ridge Test = {}'.format(round(np.sqrt(score_R_test), 2))) # 'Ecart = {:.2f} min'.format(np.sqrt(score_R_test)) fig1 = Figure_style(8, 4, 1, 1) fig1.pos_update(0, 0) # fig1.cust_scatter(df1['heure_depart'], df1['DEPARTURE_DELAY'], markeredge = True) fig1.cust_plot(X_testt, Y_testt, color='b', linestyle=':', linewidth=2, marker=('b', 's', 10)) fig1.cust_plot(X_testt, result_test, color='g', linewidth=3) fig1.style() fig1.set_ylabel('Delay (minutes)', fontsize=14) fig1.set_xlabel('Departure time', fontsize=14) # ____________________________________ # convert and set the x ticks labels fct_convert = lambda x: (int(x / 3600), int(divmod(x, 3600)[1] / 60)) fig1.axs[fig1.ix, fig1.iy].set_xticklabels([ '{:2.0f}h{:2.0f}m'.format(*fct_convert(x)) for x in fig1.axs[fig1.ix, fig1.iy].get_xticks() ]) # %% ########################################################################### ####### Random Forest_Train ####### df4 = create_df(df, 
carrier, id_airport) # X_rf_Train = np.array(df3[['schedule_depart','schedule_arrivee', 'ARRIVAL_DELAY', 'SCHEDULED_TIME','ELAPSED_TIME','weekday']]) # X_rf_Train = np.hstack((X_rf_Train)) df4 = df4[['DEPARTURE_DELAY', 'schedule_depart_mnts']].dropna(how='any', axis=0) X_rf_Train = np.array(df4['schedule_depart_mnts']) Y_rf_Train = np.array(df4['DEPARTURE_DELAY']) X_rf_Train = X_rf_Train.reshape(len(X_rf_Train), 1) Y_rf_Train = Y_rf_Train.reshape(len(Y_rf_Train), 1) rf = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=123456) rf.fit(X_rf_Train, Y_rf_Train) predicted_train = rf.predict(X_rf_Train) test_score = r2_score(Y_rf_Train, predicted_train) spearman = spearmanr(Y_rf_Train, predicted_train) # pearson = pearsonr(Y_rf_Train, predicted_train) # print(f'Out-of-bag R-2 score estimate: {rf.oob_score_:>5.3}') # print(f'Test data R-2 score: {test_score:>5.3}') # print(f'Test data Spearman correlation: {spearman[0]:.3}') # print("R^2 for RF Train:",test_score ) print('MSE RF Train= {}'.format( round(metrics.mean_squared_error(predicted_train, Y_rf_Train), 2))) # print(f'Test data Pearson correlation: {pearson[0]:.3}') ####### Random Forest_Test ####### df41 = create_df(df_test, carrier, id_airport) # X_rf_Train = np.array(df3[['schedule_depart','schedule_arrivee', 'ARRIVAL_DELAY', 'SCHEDULED_TIME','ELAPSED_TIME','weekday']]) # X_rf_Train = np.hstack((X_rf_Train)) df41 = df41[['DEPARTURE_DELAY', 'schedule_depart_mnts']].dropna(how='any', axis=0) X_rf_Test = np.array(df41['schedule_depart_mnts']) Y_rf_Test = np.array(df41['DEPARTURE_DELAY']) X_rf_Test = X_rf_Test.reshape(len(X_rf_Test), 1) Y_rf_Test = Y_rf_Test.reshape(len(Y_rf_Test), 1) predicted_test = rf.predict(X_rf_Test) test_score = r2_score(Y_rf_Test, predicted_test) spearman = spearmanr(Y_rf_Test, predicted_test) # pearson = pearsonr(Y_rf_Train, predicted_train) # print(f'Out-of-bag R-2 score estimate: {rf.oob_score_:>5.3}') # print(f'Test data R-2 score: {test_score:>5.3}') # print(f'Test 
data Spearman correlation: {spearman[0]:.3}') score_rf_test = r2_score(X_rf_Test, Y_rf_Test) # print("R^2 for RF Test: ",score_rf_test ) score_RF_test = metrics.mean_squared_error(predicted_test, Y_rf_Test) print(' MSE RF Test = {}'.format(round(score_RF_test, 2))) fig1 = Figure_style(8, 4, 1, 1) fig1.pos_update(0, 0) # fig1.cust_scatter(df1['heure_depart'], df1['DEPARTURE_DELAY'], markeredge = True) fig1.cust_plot(X_rf_Test, Y_rf_Test, color='b', linestyle=':', linewidth=2, marker=('b', 's', 10)) fig1.cust_plot(X_rf_Test, predicted_test, color='g', linewidth=3) fig1.style() fig1.set_ylabel('Delay (minutes)', fontsize=14) fig1.set_xlabel('Departure time', fontsize=14) # ____________________________________ # convert and set the x ticks labels fct_convert = lambda x: (int(x / 3600), int(divmod(x, 3600)[1] / 60)) fig1.axs[fig1.ix, fig1.iy].set_xticklabels([ '{:2.0f}h{:2.0f}m'.format(*fct_convert(x)) for x in fig1.axs[fig1.ix, fig1.iy].get_xticks() ]) # %% ########################################################################### ####### Neural Network_Train ####### df5 = create_df(df, carrier, id_airport) # X_rf_Train = np.array(df3[['schedule_depart','schedule_arrivee', 'ARRIVAL_DELAY', 'SCHEDULED_TIME','ELAPSED_TIME','weekday']]) # X_rf_Train = np.hstack((X_rf_Train)) df5 = df5[['DEPARTURE_DELAY', 'schedule_depart_mnts']].dropna(how='any', axis=0) X_nn_Train = np.array(df5['schedule_depart_mnts']) Y_nn_Train = np.array(df5['DEPARTURE_DELAY']) X_nn_Train = X_nn_Train.reshape(len(X_nn_Train), 1) Y_nn_Train = Y_nn_Train.reshape(len(Y_nn_Train), 1) regr = LinearSVR(random_state=0) # from sknn.mlp import Classifier, Layer # #regr = LinearSVR(random_state=0) # regr = Classifier( # layers=[ # Layer("Rectifier", units=10), # Layer("Linear")], # learning_rate=0.02, # n_iter=5) regr.fit(X_nn_Train, Y_nn_Train) predict_train_NN = regr.predict(X_nn_Train) r2_NN_train = r2_score(Y_nn_Train, predict_train_NN) # print("R^2 for NN Train:",r2_NN_train ) print('MSE NN Train= 
{}'.format( round(metrics.mean_squared_error(predict_train_NN, Y_nn_Train), 2))) ####### Neural Network_Test ####### df51 = create_df(df_test, carrier, id_airport) # X_rf_Train = np.array(df3[['schedule_depart','schedule_arrivee', 'ARRIVAL_DELAY', 'SCHEDULED_TIME','ELAPSED_TIME','weekday']]) # X_rf_Train = np.hstack((X_rf_Train)) df51 = df51[['DEPARTURE_DELAY', 'schedule_depart_mnts']].dropna(how='any', axis=0) X_NN_Test = np.array(df51['schedule_depart_mnts']) Y_NN_Test = np.array(df51['DEPARTURE_DELAY']) X_NN_Test = X_NN_Test.reshape(len(X_NN_Test), 1) Y_NN_Test = Y_NN_Test.reshape(len(Y_NN_Test), 1) predict_test_NN = regr.predict(X_NN_Test) score_NN_test = r2_score(X_NN_Test, Y_NN_Test) # print("R^2 for NN Test: ",score_NN_test ) MSE_NN_test = metrics.mean_squared_error(predict_test_NN, Y_NN_Test) print('MSE NN Test = {}'.format(round(MSE_NN_test, 2))) fig1 = Figure_style(8, 4, 1, 1) fig1.pos_update(0, 0) # fig1.cust_scatter(df1['heure_depart'], df1['DEPARTURE_DELAY'], markeredge = True) fig1.cust_plot(X_NN_Test, Y_NN_Test, color='b', linestyle=':', linewidth=2, marker=('b', 's', 10)) fig1.cust_plot(X_NN_Test, predict_test_NN, color='g', linewidth=3) fig1.style() fig1.set_ylabel('Delay (minutes)', fontsize=14) fig1.set_xlabel('Departure time', fontsize=14) # convert and set the x ticks labels fct_convert = lambda x: (int(x / 3600), int(divmod(x, 3600)[1] / 60)) fig1.axs[fig1.ix, fig1.iy].set_xticklabels([ '{:2.0f}h{:2.0f}m'.format(*fct_convert(x)) for x in fig1.axs[fig1.ix, fig1.iy].get_xticks() ]) # %% ########################################################################### ####### SVM_Train ####### df6 = create_df(df, carrier, id_airport) df6 = df6[['DEPARTURE_DELAY', 'schedule_depart_mnts']].dropna(how='any', axis=0) X_svm_Train = np.array(df6['schedule_depart_mnts']) Y_svm_Train = np.array(df6['DEPARTURE_DELAY']) X_svm_Train = X_svm_Train.reshape(len(X_svm_Train), 1) Y_svm_Train = Y_svm_Train.reshape(len(Y_svm_Train), 1) regr = SVR(kernel='linear') 
regr.fit(X_svm_Train, Y_svm_Train) predict_train_svm = regr.predict(X_svm_Train) r2_svm_train = r2_score(Y_nn_Train, predict_train_svm) # print("R^2 for svm Train:",r2_svm_train ) print('MSE svm Train= {}'.format( round(metrics.mean_squared_error(predict_train_svm, Y_svm_Train), 2))) ####### SVM_Test ####### df61 = create_df(df_test, carrier, id_airport) # X_rf_Train = np.array(df3[['schedule_depart','schedule_arrivee', 'ARRIVAL_DELAY', 'SCHEDULED_TIME','ELAPSED_TIME','weekday']]) # X_rf_Train = np.hstack((X_rf_Train)) df61 = df61[['DEPARTURE_DELAY', 'schedule_depart_mnts']].dropna(how='any', axis=0) X_svm_Test = np.array(df61['schedule_depart_mnts']) Y_svm_Test = np.array(df61['DEPARTURE_DELAY']) X_svm_Test = X_svm_Test.reshape(len(X_svm_Test), 1) Y_svm_Test = Y_svm_Test.reshape(len(Y_svm_Test), 1) predict_test_svm = regr.predict(X_svm_Test) r2_svm_test = r2_score(X_svm_Test, Y_svm_Test) # print("R^2 for svm Test: ",r2_svm_test ) mse_svm_test = metrics.mean_squared_error(predict_test_svm, Y_svm_Test) print('MSE svm Test= {}'.format(round(mse_svm_test, 2))) fig1 = Figure_style(8, 4, 1, 1) fig1.pos_update(0, 0) # fig1.cust_scatter(df1['heure_depart'], df1['DEPARTURE_DELAY'], markeredge = True) fig1.cust_plot(X_svm_Test, Y_svm_Test, color='b', linestyle=':', linewidth=2, marker=('b', 's', 10)) fig1.cust_plot(X_svm_Test, predict_test_svm, color='g', linewidth=3) fig1.style() fig1.set_ylabel('Delay (minutes)', fontsize=14) fig1.set_xlabel('Departure time', fontsize=14) # ____________________________________ # convert and set the x ticks labels fct_convert = lambda x: (int(x / 3600), int(divmod(x, 3600)[1] / 60)) fig1.axs[fig1.ix, fig1.iy].set_xticklabels([ '{:2.0f}h{:2.0f}m'.format(*fct_convert(x)) for x in fig1.axs[fig1.ix, fig1.iy].get_xticks() ]) return np.mean(result_L_test), np.mean(result_test), np.mean( predicted_test), np.mean(predict_test_NN), np.mean(predict_test_svm)
## Create K folds k_fold = KFold(Y_train_raw.shape[0], n_folds=10) for train, test in k_fold: X1 = X_train_reduced[train] Y1 = Y_train_raw[train] X2 = X_train_reduced[test] Y2 = Y_train_raw[test] ## Train Classifiers on fold rdg_clf = Ridge(alpha=0.5) rdg_clf.fit(X1, Y1) lso_clf = Lasso(alpha=0.6257) lso_clf.fit(X1, Y1) svr_clf = LinearSVR(C=1e3) svr_clf.fit(X1, Y1) ## Score Classifiers on fold rdg_clf_score = rdg_clf.score(X2, Y2) lso_clf_score = lso_clf.score(X2, Y2) svr_clf_score = svr_clf.score(X2, Y2) print "Ridge: ", rdg_clf_score print "Lasso: ", lso_clf_score print "SVR_RBF: ", svr_clf_score ## Train final Classifiers # clf = Ridge(alpha=.5) clf = LinearSVR(C=1e3, gamma=0.1) clf.fit(X_train_reduced, Y_train_raw)
def linear_svr(dataframe, target=None, drop_features=None, without_outliers=False, split=0.2):
    """Fit a LinearSVR on the numeric columns of `dataframe`.

    Parameters:
        dataframe: input frame; non-numeric ('object') columns are ignored.
        target: name of the column to predict.
        drop_features: extra columns to drop before training.
        without_outliers: accepted for interface compatibility but unused here.
        split: fraction of rows held out (chronologically, from the end) for testing.

    Returns:
        (feature_importance, results, y_result, reg) — per-feature coefficients,
        a one-row metrics DataFrame, a test-vs-prediction frame, and the model.
    """
    # BUG FIX: mutable default argument ([]) replaced with a None sentinel.
    if drop_features is None:
        drop_features = []

    warnings.filterwarnings("ignore", category=ConvergenceWarning,
                            message="^Liblinear failed to converge")

    # Remove non-numerical and undesired features from dataframe.
    dataframe = dataframe.loc[:, dataframe.dtypes != 'object']
    dataframe = dataframe.drop(drop_features, axis=1)

    # Transform data into arrays and define the target variable;
    # NaNs are replaced with 0 / finite values.
    numerical_features = dataframe.loc[:, dataframe.columns != target]
    X = np.nan_to_num(numerical_features.to_numpy())
    y = np.nan_to_num(dataframe[target].to_numpy())

    # Chronological (unshuffled) train/test split; guard against an empty
    # test slice when the frame is very small (round(...) could be 0,
    # and X[:-0] would have been an empty training set).
    testsplit = max(1, round(split * X.shape[0]))
    X_train = X[:-testsplit]
    X_test = X[-testsplit:]
    y_train = y[:-testsplit]
    y_test = y[-testsplit:]

    # Train the linear SVR model.
    reg = LinearSVR(random_state=0, tol=1e-5)
    reg.fit(X_train, y_train)

    # BUG FIX: LinearSVR.coef_ is 1-D for a single target, so the original
    # reg.coef_[0] took one scalar and broadcast it to every feature.
    # Use the full coefficient vector (only meaningful for linear models).
    feature_importance = pd.Series(reg.coef_, index=numerical_features.columns)

    # Prediction with the trained model.
    y_pred = reg.predict(X_test)

    # BUG FIX: the original assigned scalars to an empty DataFrame (which
    # yields zero-row columns) and then assigned a length-10 cross-val
    # array, which raises ValueError.  Compute each cross-val metric once
    # and build a single-row DataFrame instead.
    cv_r2 = cross_val_score(reg, X, y, cv=10, scoring="r2")
    cv_ev = cross_val_score(reg, X, y, cv=10, scoring="explained_variance")
    results = pd.DataFrame([{
        'Train mean': np.mean(y_train),
        'Train std': np.std(y_train),
        'Test mean': np.mean(y_test),
        'Test std': np.std(y_test),
        'Prediction mean': np.mean(y_pred),
        'Prediction std': np.std(y_pred),
        'Mean Squared Error': mean_squared_error(y_test, y_pred),
        'Mean Absolute Error': mean_absolute_error(y_test, y_pred),
        'R2 score': r2_score(y_test, y_pred),
        'Explained variance score': explained_variance_score(y_test, y_pred),
        'Cross-val R2 score (mean)': np.mean(cv_r2),
        'Cross-val R2 scores': cv_r2.tolist(),       # stored as a list cell
        'Cross-val explained_variance score (mean)': np.mean(cv_ev),
        'Cross-val explained_variance scores': cv_ev.tolist(),
    }])

    y_result = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
    return feature_importance, results, y_result, reg
class ESN_linear_svr_learner():
    """Echo-state-network features with a LinearSVR readout.

    A SimpleESN reservoir expands the input series into readout features;
    an epsilon-insensitive LinearSVR is then trained on those features.
    """

    def __init__(self, n_readout=1000, n_components=100, damping=0.5,
                 weight_scaling=0.9, discard_steps=0, random_state=None,
                 epsilon=0.0, C=1.0, max_iter=1000):
        # Keep every hyper-parameter on self so get_params() can report it.
        self.n_readout = n_readout
        self.n_components = n_components
        self.damping = damping
        self.weight_scaling = weight_scaling
        self.discard_steps = discard_steps
        self.random_state = random_state
        self.epsilon = epsilon
        self.C = C
        self.max_iter = max_iter

        # Reservoir: fixed random expansion of the input.
        self.ESN = SimpleESN(n_readout=self.n_readout,
                             n_components=self.n_components,
                             damping=self.damping,
                             weight_scaling=self.weight_scaling,
                             discard_steps=self.discard_steps,
                             random_state=check_random_state(self.random_state))

        # Readout: linear SVR on the reservoir output.
        # NOTE(review): the SVR is constructed with random_state=None rather
        # than self.random_state — kept as in the original; confirm intent.
        self.Linear_SVR = LinearSVR(epsilon=self.epsilon,
                                    tol=1e-4,
                                    C=self.C,
                                    loss='epsilon_insensitive',
                                    fit_intercept=True,
                                    intercept_scaling=1.,
                                    dual=True,
                                    verbose=0,
                                    random_state=None,
                                    max_iter=self.max_iter)

    def fit(self, X, y):
        """Fit the reservoir on X, then the SVR on the transformed X."""
        self.ESN.fit(X)
        self.Linear_SVR.fit(self.ESN.transform(X), y)
        return self

    def predict(self, X):
        """Predict targets for X via reservoir transform + SVR readout."""
        return self.Linear_SVR.predict(self.ESN.transform(X))

    def get_params(self, deep=True):
        """Return the hyper-parameters; deep=True adds the SVR ones too."""
        params = {
            'n_readout': self.n_readout,
            'n_components': self.n_components,
            'damping': self.damping,
            'weight_scaling': self.weight_scaling,
        }
        if deep:
            params.update({
                'discard_steps': self.discard_steps,
                'random_state': self.random_state,
                'epsilon': self.epsilon,
                'C': self.C,
                'max_iter': self.max_iter,
            })
        return params
# Stack the headline TF/count features with the extra headline columns
# (sparse hstack keeps everything in CSR form).
train_X_Headline = hstack(
    [train_vect_2_hst, csr_matrix(train_headline.values)])
test_X_Headline = hstack([test_vect_2_hst, csr_matrix(test_headline.values)])
y2 = train['SentimentHeadline']
np.shape(train_X_Title)

# model for sentiment title
X_train, X_test, y_train, y_test = train_test_split(train_X_Title,
                                                    y1,
                                                    test_size=0.20,
                                                    random_state=42)
LSVR1 = LinearSVR(C=0.2)
LSVR1.fit(X_train, y_train)
y_pred1 = LSVR1.predict(X_test)
mae1 = mean_absolute_error(y_pred1, y_test)
# NOTE(review): prints 1 - MAE (a score, higher is better), not the MAE
# itself, despite the label.
print('MAE:', 1 - mae1)

# model for sentiment headline (same split settings, smaller C)
X_train, X_test, y_train, y_test = train_test_split(train_X_Headline,
                                                    y2,
                                                    test_size=0.20,
                                                    random_state=42)
LSVR2 = LinearSVR(C=0.1)
LSVR2.fit(X_train, y_train)
y_pred2 = LSVR2.predict(X_test)
mae2 = mean_absolute_error(y_pred2, y_test)
# # Regression # SVM algorithm can also be used for regression - instead of finding the street # with the fewest instance violations, it tries to find the street with the # most instance violations # Let's generate some linearly random data np.random.seed(42) m = 50 X = 2 * np.random.rand(m, 1) y = (4 + 3 * X + np.random.randn(m, 1)).ravel() # Train an SVR algorithm from sklearn.svm import LinearSVR svm_reg = LinearSVR(epsilon=1.5, random_state=42) svm_reg.fit(X, y) svm_reg1 = LinearSVR(epsilon=1.5, random_state=42) svm_reg2 = LinearSVR(epsilon=0.5, random_state=42) svm_reg1.fit(X, y) svm_reg2.fit(X, y) def find_support_vectors(svm_reg, X, y): y_pred = svm_reg.predict(X) off_margin = (np.abs(y - y_pred) >= svm_reg.epsilon) return np.argwhere(off_margin) svm_reg1.support_ = find_support_vectors(svm_reg1, X, y) svm_reg2.support_ = find_support_vectors(svm_reg2, X, y)
# Exercise 10 P166 # data set housing = fetch_california_housing() X = housing["data"] y = housing["target"] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # scale scaler = StandardScaler() X_train_scaled = scaler.fit_transform(X_train) X_test_scaled = scaler.transform(X_test) # build model lin_svr = LinearSVR(random_state=42) lin_svr.fit(X_train_scaled, y_train) y_pred = lin_svr.predict(X_train_scaled) mse = mean_squared_error(y_train, y_pred) print('LinearSVR MSE: ', mse) # 0.949968822217229 not good print('LinearSVR RMSE: ', np.sqrt(mse)) # grid search the best estimator with SVR() model which can use kernel skill param_distributions = {"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 10)} rnd_search_cv = RandomizedSearchCV(SVR(), param_distributions, n_iter=10, verbose=2, random_state=42) rnd_search_cv.fit(X_train_scaled, y_train) print('best estimator: ', rnd_search_cv.best_estimator_) '''SVR(C=4.745401188473625, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.07969454818643928, kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
# Impute missing values imputer = SimpleImputer(strategy="mean") imputer.fit(X_train) X_train = imputer.transform(X_train) X_test = imputer.transform(X_test) print("Normalizing...") # Normalize feature values using MinMaxScaler scaler = MinMaxScaler(feature_range=(-1, 1)) scaler.fit(X_train) X_train = scaler.transform(X_train) X_test = scaler.transform(X_test) print("Training...") if algorithm == "svr": reg = LinearSVR(C=10, random_state=42, verbose=1, max_iter=10000) elif algorithm == "tree": reg = DecisionTreeRegressor(random_state=42, criterion="mse") elif algorithm == "knn": reg = KNeighborsRegressor(n_neighbors=5) elif algorithm == "forest": reg = RandomForestRegressor(n_estimators=100, criterion="mse", n_jobs=12) reg.fit(X_train, y_train) y_pred = reg.predict(X_test) print("MAE: ", mean_absolute_error(y_test, y_pred)) print("MAPE: ", mean_absolute_percentage_error(y_test, y_pred)) print("RMSE: ", mean_squared_error(y_test, y_pred, squared=False))
# plot_predictions(svm_clf, [-1.5, 2.5, -1, -1.5]) # plot_dataset(X, y, [-1.5, 2.5, -1, 1.5]) # gamma, C = hyperparams[i] # plt.title(r"$\gamma = {}, C={}$".format(gamma, C), fontsize=16) # plt.show() rnd.seed(42) m = 50 X = 2 * rnd.rand(m, 1) y = (4 + 3 * X + rnd.randn(m, 1)).ravel() svm_reg1 = LinearSVR(epsilon=1.5) svm_reg2 =LinearSVR(epsilon=0.5) svm_reg1.fit(X, y) svm_reg2.fit(X, y) def find_support_vectors(svm_reg, X, y): y_pred = svm_reg.predict(X) off_margin = (np.abs(y - y_pred) >= svm_reg.epsilon) return np.argwhere(off_margin) svm_reg1.support_ = find_support_vectors(svm_reg1, X, y) svm_reg2.support_ = find_support_vectors(svm_reg2, X, y) eps_x1 = 1 eps_y_pred = svm_reg1.predict([[eps_x1]])
# Grid-search a LinearSVR over C using a predefined train/dev split.
ps = PredefinedSplit(NO2_test_fold)
param_grid = {
    # "kernel": ["rbf"],
    "C": [0.1, 1, 10],
    "epsilon": [0.1]
}
svr = LinearSVR(C=10)
# NOTE(review): `iid` was removed from GridSearchCV in scikit-learn 0.24;
# this call only works on older versions.
gs = GridSearchCV(svr,
                  param_grid,
                  scoring="neg_mean_squared_error",
                  n_jobs=1,
                  iid=False,
                  refit=True,
                  cv=ps)
# NOTE(review): `svr` is fit on all of X_NO2 and then evaluated on
# train/dev below — verify this is intended rather than leakage.
svr.fit(preprocessing.normalize(X_NO2), Y_NO2)
gs.fit(preprocessing.normalize(X_NO2), Y_NO2)
evaluate_mse(svr, preprocessing.normalize(NO2_train_f),
             preprocessing.normalize(NO2_dev_f), Y_NO2_train, Y_NO2_dev)

# SGD Regressor
ps = PredefinedSplit(NO2_test_fold)
param_grid = {
    "loss": ["squared_loss", "huber"],
    # "penalty": ["l2", "l1"],
    "penalty": ["l2", "l1"],
    "alpha": [0.0001, 0.001, 1, 10],
    "shuffle": [True, False],
    "n_iter": [10]
}
# Feature lists for the Rossmann-style sales model (Python 2 snippet —
# note the statement-form prints below).
cat_vars = ['DayOfWeek','Promo','StateHoliday','SchoolHoliday','StoreType','Assortment','CompetitionOpenSinceMonth',
            'CompetitionOpenSinceYear','Promo2','Promo2SinceWeek','Promo2SinceYear','PromoInterval','Day','Month','Year']
num_vars = ['Open','Store','CompetitionDistance','ratio1','ratio2']
X_trn, X_val = train_test_split(train, test_size=0.012, random_state=10)
print 'Training Stage 1 Models'

#train svm
svm1 = LinearSVR(verbose=True)
svm1.fit(X_trn[cat_vars+num_vars],X_trn['Sales'])
# Stage-1 meta-feature: SVM predictions over the full training set.
svm1_feature = svm1.predict(train[cat_vars+num_vars])
preds = svm1.predict(X_val[cat_vars+num_vars])
# RMSPE on the validation fold (targets appear to be log-scale, hence exp).
print 'svm ',(np.mean(((np.exp(preds)-np.exp(X_val['Sales']))/(np.exp(X_val['Sales'])+1))**2))**0.5

#train xgb
dtrain = xgb.DMatrix(X_trn[cat_vars+num_vars],X_trn['Sales'])
dvalid = xgb.DMatrix(X_val[cat_vars+num_vars],X_val['Sales'])
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
num_boost_round = 50
params1 = {"objective": "reg:linear","booster" : "gbtree",
           "eta": 0.5,"max_depth": 2,"subsample": 0.5,"colsample_bytree": 0.4,
           "nthread":4,"silent": 1,"seed": 1301}
gbm1 = xgb.train(params1, dtrain, num_boost_round, evals=watchlist,
                 early_stopping_rounds=50, feval=rmspe_xg, verbose_eval=True)
if __name__ == '__main__':
    # NOTE: Make sure that the outcome column is labeled 'target' in the data file
    url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/auto-insurance.csv'
    # Load and coerce the whole table to float32; the last column is the
    # regression target, everything before it is a feature.
    data = pd.read_csv(url, header=None).values.astype('float32')
    X, y = data[:, :-1], data[:, -1]
    training_features, testing_features, training_target, testing_target = (
        train_test_split(X, y, random_state=1))

    # Average CV score on the training set was: -29.116294532472594
    exported_pipeline = LinearSVR(C=15.0,
                                  dual=False,
                                  epsilon=0.0001,
                                  loss="squared_epsilon_insensitive",
                                  tol=0.001)
    # Fix random state in exported estimator
    if hasattr(exported_pipeline, 'random_state'):
        exported_pipeline.random_state = 1

    exported_pipeline.fit(training_features, training_target)
    results = exported_pipeline.predict(testing_features)

    # make a prediction on a new row of data
    row = [108]
    yhat = exported_pipeline.predict([row])
    print('Predicted: %.3f' % yhat[0])
    # NOTE(review): these first statements are the tail of a function
    # (`sample`, judging by the call below) whose `def` header lies outside
    # this chunk; indentation reconstructed accordingly — confirm.
    combined = np.append(X, np.matrix(Y).T, axis=1)
    np.random.shuffle(combined)
    tail_size = -1 * size
    last_column = X.shape[1]
    # First rows become training data, the last `size` rows the test set;
    # labels live in the appended final column.
    training_labels = combined[:tail_size, last_column]
    training_data = combined[:tail_size, :-2]
    test_data = combined[tail_size:, :-2]
    actual_labels = combined[tail_size:, last_column]
    return training_data, np.ravel(training_labels), test_data, np.ravel(actual_labels)


# Build a sparse author-indicator matrix from "sample_id author_id" pairs.
training = open('author_features')
NO_TRAINING_SAMPLES = 6000
NO_OF_AUTHORS = 10000
matrix = dok_matrix((NO_TRAINING_SAMPLES, NO_OF_AUTHORS), dtype=np.int)
for line in training.readlines():
    values = line.rstrip().split()
    matrix[int(values[0]), int(values[1])] = 1

labels_file = open('year_training_labels')
labels = [int(x) for x in labels_file.readline().rstrip().split()]
training_matrix = matrix[:4498]
training_data, training_labels, test_data, actual_labels = sample(training_matrix, labels)

# Train a linear SVR and report per-sample predictions (Python 2 prints).
classifier = LinearSVR()
classifier.fit(training_data, training_labels)
output = classifier.predict(test_data)
for index, predicted in enumerate(output):
    print '%s %s' % (predicted, actual_labels[index])
print metrics.explained_variance_score(actual_labels, output)
(x_input, y_input) = get_training_data(feature_lin_lambda=feature_lin_lambda,
                                       feature_lin_var=feature_lin_var,
                                       data_exp=data_exp)
# Normalize the features to [-1, 1].
x_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
# Should y be normalized as well?  There is no strong reason to,
# but it does affect the results!
# y_scaler = preprocessing.MinMaxScaler(feature_range=(-1,1))
# BUG FIX: x_input_minmax was only defined in commented-out code, so every
# use below raised NameError.  Scale the inputs here.
x_input_minmax = x_scaler.fit_transform(x_input)
# y_input_minmax = y_scaler.fit_transform(y_input.reshape(-1,1))
# y_input_minmax = y_input_minmax.reshape((len(y_input_minmax)))

# Select C by cross-validation (cv=sample_num => leave-one-out).
best_cv_score = -1e+30
for log2c in np.arange(-10, 30, 1):
    clf = LinearSVR(C=2**log2c, epsilon=0.0001)
    clf.fit(x_input_minmax, y_input)
    # NOTE(review): 'mean_squared_error' scoring was renamed
    # 'neg_mean_squared_error' in scikit-learn >= 0.18.
    cv_score = cross_val_score(cv=sample_num, estimator=clf,
                               X=x_input_minmax, y=y_input,
                               scoring='mean_squared_error').mean()
    print(cv_score)
    if cv_score > best_cv_score:
        best_cv_score = cv_score
        bestc = 2**log2c

# Refit with the selected C and predict on the training inputs.
clf = LinearSVR(C=bestc, epsilon=0.0001)
clf.fit(x_input_minmax, y_input)
y_pred = clf.predict(x_input_minmax)
# y_pred = y_scaler.inverse_transform(y_pred.reshape(-1,1))

# Plot one feature dimension against targets and predictions.
view_point = 5
plt.plot(x_input[:, view_point], y_input, 'bo-',
         x_input[:, view_point], y_pred, 'rs-')
# Build lag-5 sliding-window samples: five consecutive values (x1..x5)
# predict the next value (y).
train_cluster = pd.DataFrame(columns=('x1', 'x2', 'x3', 'x4', 'x5', 'y'))
for i in range(0, (len(cluster_i) - 5)):
    train_cluster.loc[i] = [cluster_i.iloc[i], cluster_i.iloc[i+1],
                            cluster_i.iloc[i+2], cluster_i.iloc[i+3],
                            cluster_i.iloc[i+4], cluster_i.iloc[i+5]]

explanatory_features = [col for col in train_cluster.columns if col not in ['y']]
explanatory_df = np.array(train_cluster[explanatory_features])
response_series = np.array(train_cluster.y)

### SUPPORT VECTOR REGRESSION MODEL
linsvr = LinearSVR(epsilon=0.1, tol=1e-4, C=1.0,
                   loss='squared_epsilon_insensitive')
linsvr.fit(explanatory_df, response_series)
# BUG FIX: the original scored an unrelated model (`svr`) here; the R^2
# stored in linsvr_rsq should come from the LinearSVR just fit above.
linsvr_rsq[c] = linsvr.score(explanatory_df, response_series)

# prediction and linear extrapolation of training data set to get further predictions.
test_cluster = train_cluster.copy()
explanatory_testdf = test_cluster[explanatory_features]
response_testseries = test_cluster.y
for i in range(0, (len(cluster_i) - 5)):
    test_cluster.loc[i] = [cluster_i.iloc[i], cluster_i.iloc[i+1],
                           cluster_i.iloc[i+2], cluster_i.iloc[i+3],
                           cluster_i.iloc[i+4],
                           linsvr.predict(explanatory_df)[i]]
# further running time series to predict into the future
class LinearSvrClass:
    """
    Name : LinearSVR
    Attribute : None
    Method : predict, predict_by_cv, save_model

    Loads the sample temperature data, builds 7-day sliding-window
    features, and trains a LinearSVR to predict the next day's value.
    """

    def __init__(self):
        # Algorithm name (used for the saved-model filename).
        self._name = 'linearsvr'

        # Base path: the parent directory of this file.
        self._f_path = os.path.abspath(
            os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir))

        # Suppress warning messages.
        warnings.filterwarnings('ignore')

        # Load the raw data.
        data = pd.read_csv(self._f_path +
                           "/regression/resource/regression_sample.csv",
                           sep=",",
                           encoding="utf-8")

        # Train/test masks: train on years <= 2017, test on years >= 2018.
        self._x = (data["year"] <= 2017)
        self._y = (data["year"] >= 2018)

        # Training data split.
        self._x_train, self._y_train = self.preprocessing(data[self._x])
        # Test data split.
        self._x_test, self._y_test = self.preprocessing(data[self._y])

        # Declare the model.
        self._model = LinearSVR(max_iter=500, tol=1e-5)
        # Train the model.
        self._model.fit(self._x_train, self._y_train)

    # Data preprocessing
    def preprocessing(self, data):
        """Turn the temperature series into sliding-window samples: each
        sample x holds the previous 7 temperatures and its label y is the
        following day's temperature.  Returns (x, y) as plain lists."""
        # Feature rows.
        x = []
        # Labels.
        y = []
        # Window size (7 days).
        base_interval = 7
        # Temperatures.
        temps = list(data["temperature"])

        for i in range(len(temps)):
            # Skip until a full window is available.
            if i < base_interval:
                continue
            y.append(temps[i])

            xa = []
            for p in range(base_interval):
                d = i + p - base_interval
                xa.append(temps[d])
            x.append(xa)
        return x, y

    # Plain prediction
    def predict(self, save_img=False, show_chart=False):
        """Predict on the held-out split, print coefficients and R2 score,
        optionally save/show a chart, and return [predictions, score]."""
        # Predict.
        y_pred = self._model.predict(self._x_test)
        # R2 score on the test split.
        score = r2_score(self._y_test, y_pred)

        # Report model internals when available.
        if hasattr(self._model, 'coef_') and hasattr(self._model, 'intercept_'):
            print(f'Coef = {self._model.coef_}')
            print(f'intercept = {self._model.intercept_}')

        print(f'Score = {score}')

        # Optionally save the chart image.
        if save_img:
            self.save_chart_image(y_pred, show_chart)

        # Predictions & score.
        return [list(y_pred), score]

    # CV prediction (Cross Validation)
    def predict_by_cv(self):
        # For regression, implement cross validation to fit the actual
        # project's needs; not provided here.
        return False

    # GridSearchCV prediction
    def predict_by_gs(self):
        pass

    # Save or refresh the model
    def save_model(self, renew=False):
        # Save the model.
        if not renew:
            # First save.
            joblib.dump(self._model,
                        self._f_path + f'/model/{self._name}_rg.pkl')
        else:
            # Replace an existing model: rename the old file with a
            # timestamp suffix, then dump the new one.
            if os.path.isfile(self._f_path + f'/model/{self._name}_rg.pkl'):
                os.rename(
                    self._f_path +
                    f'/model/{self._name}_rg.pkl', self._f_path +
                    f'/model/{str(self._name) + str(time.time())}_rg.pkl')
            joblib.dump(self._model,
                        self._f_path + f'/model/{self._name}_rg.pkl')

    # Save the regression chart
    def save_chart_image(self, data, show_chart):
        """Plot test labels (red) against predictions (blue) and save the
        figure; optionally display it."""
        # Figure size.
        plt.figure(figsize=(15, 10), dpi=100)
        # Ground-truth labels.
        plt.plot(self._y_test, c='r')
        # Predicted values.
        plt.plot(data, c='b')
        # Save as an image.
        plt.savefig('./chart_images/tenki-kion-lr.png')
        # Show the chart (optional).
        if show_chart:
            plt.show()

    def __del__(self):
        # Release the data splits and model on teardown.
        del self._x_train, self._x_test, self._y_train, self._y_test, self._x, self._y, self._model
def linearSVR(train, trainLable, testData):
    """Fit a default-configured LinearSVR on (train, trainLable) and
    return its predictions for testData."""
    model = LinearSVR()
    model.fit(train, trainLable)
    return model.predict(testData)
md = dnn_reg(X_train, y_train, X_test, y_test)
reg_eval(X_test, y_test, md)


### Lasso CV regression
def reg_eval2(y_test, model):
    """Print MSE, R2 and explained-variance for `model`'s predictions.

    NOTE(review): predictions are made on the module-level X_test, not a
    parameter — confirm that is intended.
    """
    y_pred = model.predict(X_test)
    print("evaluation the results for model:", model)
    print("MSE:", mean_squared_error(y_test, y_pred))
    print("R2:", r2_score(y_test, y_pred))
    print("EVS:", explained_variance_score(y_test, y_pred))


lasso = LassoCV(cv=5, random_state=0, max_iter=10000)
lasso.fit(X_train, y_train)
reg_eval2(y_test, lasso)

# ElasticNet Regression
ela = ElasticNetCV(l1_ratio=0.8, normalize=True, max_iter=5000, random_state=77)
ela.fit(X_train, y_train)
print("R square:", ela.score(X_test, y_test))
reg_eval2(y_test, ela)

# SVR Regression
from sklearn.svm import LinearSVR
LSVR = LinearSVR(epsilon=0.1, random_state=0, tol=1e-5, max_iter=10000)
# scaler=RobustScaler()
# pipe=Pipeline(steps=[("scaling",scaler),("rg",LSVR)])
LSVR.fit(X_train, y_train)
# BUG FIX: the original line ended with an unbalanced parenthesis
# ("reg_eval2(y_test,LSVR))"), which is a SyntaxError.
reg_eval2(y_test, LSVR)
# Split the current fold: ~idx selects the training rows, idx the
# validation rows (Python 2 snippet — note the backtick repr below).
trainingSet = train[~idx]
validationSet = train[idx]
tr_X = np.matrix(trainingSet[feature_names])
tr_Y = np.array(trainingSet["Response"])
val_X = np.matrix(validationSet[feature_names])
val_Y = np.array(validationSet["Response"])

# Fit the fold model and collect out-of-fold predictions.
regm = LinearSVR(C=0.06,
                 epsilon=0.45,
                 tol=1e-5,
                 dual=True,
                 verbose=True,
                 random_state=133)
regm.fit(tr_X, tr_Y)
preds = regm.predict(val_X)
df = pd.DataFrame(
    dict({
        "Id": validationSet["Id"],
        "ground_truth": validationSet["Response"],
        "linsvr_preds": preds
    }))
linsvr_val = linsvr_val.append(df, ignore_index=True)

# Per-fold test-set predictions, stored in a "FoldN" column
# (` i ` is Python 2 backtick repr of the fold index).
tpreds = regm.predict(test_X)
cname = "Fold" + ` i `
linsvr_test[cname] = tpreds
# Tuning models and test for all features # Linear Regression linreg = LinearRegression() linreg.fit(X_train, y_train) acc_model(0,linreg,X_train,X_test) print("Done") # Support Vector Machines svr = SVR() svr.fit(X_train, y_train) acc_model(1,svr,X_train,X_test) print("Done") # Linear SVR linear_svr = LinearSVR() linear_svr.fit(X_train, y_train) acc_model(2,linear_svr,X_train,X_test) print("Done") # MLPRegressor mlp = MLPRegressor() param_grid = {'hidden_layer_sizes': [i for i in range(2,20)], 'activation': ['relu'], 'solver': ['adam'], 'learning_rate': ['constant'], 'learning_rate_init': [0.01], 'power_t': [0.5], 'alpha': [0.0001], 'max_iter': [1000], 'early_stopping': [True], 'warm_start': [False]}
class SVMTextEncoder(BaseEstimator, TransformerMixin):
    """TF-IDF + linear SVM text encoder.

    Vectorizes raw text with a bigram TF-IDF and encodes it through a
    LinearSVC decision function (classification metrics) or a LinearSVR
    prediction (regression metrics).  Train with fit_transform, which
    returns out-of-fold encodings; encode new data with transform.
    """

    # number of jobs to execute in parallel
    NUM_JOBS = 3
    # number of folds to apply to svm fit
    NUM_FOLDS = 3

    # !! add tuning
    def __init__(self, metric, random_seed):
        super().__init__()
        self._vect = TfidfVectorizer(ngram_range=[1, 2], max_features=30000)
        self._random_seed = random_seed
        if metric in classification_metrics:
            self._model = LinearSVC(class_weight="balanced",
                                    random_state=random_seed)
            self.mode = "classification"
        elif metric in regression_metrics:
            self._model = LinearSVR(random_state=random_seed)
            self.mode = "regression"
        else:
            raise AttributeError(
                "metric not in classification or regression metrics")

    def fit(self, X, y):
        # BUG FIX: `raise NotImplemented` raises TypeError, because
        # NotImplemented is a constant, not an exception class.
        raise NotImplementedError(
            "SVMTextEncoder is trained via fit_transform")

    def transform(self, X):
        """Encode text X into an (n_samples, k) array using the fit model."""
        X = pd.Series(X.squeeze()).fillna(MISSING_VALUE_INDICATOR).values
        Xv = self._vect.transform(X)
        if self.mode == "classification":
            out = self._model.decision_function(Xv)
        else:
            out = self._model.predict(Xv)
        # Always return a 2-D array.
        if len(out.shape) == 1:
            out = out.reshape(-1, 1)
        return out

    def fit_transform(self, X, y=None, **kwargs):
        """Fit vectorizer + model on (X, y); return out-of-fold encodings."""
        assert y is not None, "SVMTextEncoder.fit_transform requires y"
        X = pd.Series(X.squeeze()).fillna(MISSING_VALUE_INDICATOR).values
        Xv = self._vect.fit_transform(X)
        self._model = self._model.fit(Xv, y)
        if self.mode == "classification":
            # Aim for NUM_FOLDS and stratified k-fold. If that doesn't work, fallback to uniform sampling.
            num_folds = min(self.NUM_FOLDS, y.value_counts().min())
            if num_folds < 2:
                # BUG FIX: KFold requires shuffle=True for random_state to
                # have effect (recent scikit-learn raises ValueError when
                # random_state is set with shuffle=False).
                cv = KFold(n_splits=self.NUM_FOLDS,
                           shuffle=True,
                           random_state=self._random_seed)
                out = cross_val_predict(
                    self._model,
                    Xv,
                    y,
                    method="decision_function",
                    n_jobs=self.NUM_JOBS,
                    cv=cv,
                )
            else:
                out = cross_val_predict(
                    self._model,
                    Xv,
                    y,
                    method="decision_function",
                    n_jobs=self.NUM_JOBS,
                    cv=num_folds,
                )
        else:
            out = cross_val_predict(self._model,
                                    Xv,
                                    y,
                                    n_jobs=self.NUM_JOBS,
                                    cv=self.NUM_FOLDS)
        if len(out.shape) == 1:
            out = out.reshape(-1, 1)
        return out