class SVR(PlayerModel):
    """Wrapper for support vector regression using scikit-learn for this project.

    The regressor is configured (linear kernel, C = 1000) and fitted on the
    training data exposed by ``PlayerModel`` as soon as the object is built.
    """

    def __init__(self):
        PlayerModel.__init__(self)
        # configure support vector regression and start training
        self.regr = SupportVectorRegression(kernel='linear', C=1000)
        self.regr.fit(self.dataset_X_train, self.dataset_Y_train)
        # Single-argument print calls behave the same on Python 2 and 3.
        print("Finish building player model.")
        print("Parameters:  %s" % (self.regr.get_params(),))
        print("============================================================")

    def testScore(self, test_X):
        """Return the mean prediction for ``test_X`` after ``normalizeTest``."""
        score = self.regr.predict(self.normalizeTest(test_X))
        return np.mean(score)

    def getParams(self):
        """Return the hyper-parameters of the underlying regressor."""
        return self.regr.get_params()

    def visualize(self):
        """Plot the training data and the fitted model along the first feature.

        Ten synthetic samples are built: every feature is held at its training
        mean except the first one, which sweeps 0.0, 0.11, ..., 0.99.
        """
        n_points = 10
        x = np.zeros((n_points, self.col - 1))
        # Broadcast the per-feature training mean into every row.
        x[:] = self.dataset_X_train.mean(0)
        # Integer range times step: same values as arange(0.0, 1.1, 0.11)
        # without the rounding-dependent length of a float-step arange.
        x[:, 0] = np.arange(n_points) * 0.11
        y = self.regr.predict(x)

        pyplot.scatter(self.dataset_X_train[:, 0:1], self.dataset_Y_train,
                       c='k', label='data')
        # pyplot.hold() was removed in Matplotlib 3.0; successive plotting
        # calls always draw onto the same axes, so no call is needed.
        pyplot.plot(x[:, 0], y, c="r", label='Support Vector Regression')
        pyplot.xlabel('data collect from player')
        pyplot.ylabel('score')
        pyplot.title('Support Vector Regression')
        pyplot.legend()
        pyplot.show()
def SVRegresion(self):
    """Fit an RBF SVR on ``training_u`` and predict x and y for ``vali_u``.

    The same estimator is fitted twice, first against ``training_x`` and then
    against ``training_y``. The reshaped arrays are stored back on ``self``.

    Returns:
        The tuple ``(self.predict_x, self.predict_y)``.
    """
    sample_count = len(self.training_u)  # the number of data

    # the requirement of SVR: 2-D inputs, 1-D targets
    self.training_u = self.training_u.reshape((sample_count, 1))
    self.training_x = self.training_x.reshape((sample_count,))
    self.training_y = self.training_y.reshape((sample_count,))
    self.vali_u = self.vali_u.reshape((len(self.vali_u), 1))

    # https://en.wikipedia.org/wiki/Radial_basis_function
    model = SVR(kernel='rbf', C=self.C, gamma=self.gamma,
                epsilon=self.epsilon)

    model.fit(self.training_u, self.training_x)
    self.predict_x = model.predict(self.vali_u)

    model.fit(self.training_u, self.training_y)
    self.predict_y = model.predict(self.vali_u)

    print(model.get_params(deep=True))
    return self.predict_x, self.predict_y
def SVRegresion(X, Y, X_vali, C, gamma, epsilon):
    """Fit an RBF SVR on (X, Y) and return its predictions for X_vali.

    X and X_vali are reshaped to single-column matrices and Y to a flat
    vector before fitting. The shapes and the estimator parameters are
    printed along the way.
    """
    sample_count = len(X)  # the number of data

    # the requirement of SVR
    X = X.reshape((sample_count, 1))
    Y = Y.reshape((sample_count,))
    X_vali = X_vali.reshape((len(X_vali), 1))

    print("%s %s" % (np.shape(X), np.shape(Y)))

    # https://en.wikipedia.org/wiki/Radial_basis_function
    model = SVR(kernel='rbf', C=C, gamma=gamma, epsilon=epsilon)
    model.fit(X, Y)
    predictions = model.predict(X_vali)

    print(model.get_params(deep=True))
    return predictions
def svr():
    """Fit an RBF SVR on the stored arrays and render the result page.

    On a POST request the arrays are loaded from the ``*_storage.npy`` files,
    an SVR is fitted with the ``C`` and ``gamma`` form fields, the fit is
    plotted to ``static/result.png`` and ``result.html`` is rendered with the
    score and the estimator parameters.
    """
    if request.method == 'POST':
        # load data
        x = np.load("x_storage.npy")
        y = np.load("y_storage.npy")
        time_set = np.load("time_storage.npy")

        # regression
        svr_rbf = SVR(kernel='rbf',
                      C=float(request.form['C']),
                      gamma=float(request.form['gamma']))
        svr_rbf.fit(x, y)
        y_rbf = svr_rbf.predict(x)

        # draw picture
        plt.scatter(time_set, y, color='darkorange', label='data')
        plt.plot(time_set, y_rbf, color='navy', lw=2, label='RBF model')
        plt.xlabel('date')
        plt.ylabel('GDP')
        plt.title('Support Vector Regression')

        # Reuse the axes that were just drawn on. plt.axes() creates a new,
        # empty Axes on current Matplotlib, so the tick locator would have
        # been applied to an overlay instead of the plot.
        ax = plt.gca()
        handles, labels = ax.get_legend_handles_labels()
        # Collapse duplicate labels so each one appears once in the legend.
        by_label = OrderedDict(zip(labels, handles))
        plt.legend(by_label.values(), by_label.keys())
        ax.xaxis.set_major_locator(plt.MaxNLocator(9))

        plt.savefig('static/result.png')
        plt.close()
        return render_template('result.html',
                               url='static/result.png',
                               co=svr_rbf.score(x, y),
                               pa=svr_rbf.get_params(),
                               C_default=request.form['C'],
                               gamma_default=request.form['gamma'])
    # NOTE(review): non-POST requests fall through and return None; confirm
    # the route only accepts POST, otherwise add an explicit response here.
class SVRForecaster(AbstractForecaster):
    """Forecaster backed by a scikit-learn ``SVR`` estimator."""

    # Candidate hyper-parameter values.
    params_grid = {
        "epsilon": list(range(0, 20, 2)),
        "gamma": ['auto'] + list(range(1, 100, 10)),
    }

    def __init__(self, epsilon=5, gamma='auto'):
        self.svr = SVR(epsilon=epsilon, gamma=gamma)

    def _decision_function(self, X):
        """Return the SVR predictions for ``X``."""
        return self.svr.predict(X)

    def _train(self, data):
        """Fit the SVR on the arrays returned by ``data.all_train_data()``."""
        X, y = data.all_train_data()
        self.svr.fit(X, y)

    def _build(self):
        pass

    def score(self, X, y):
        """Return the root mean squared relative error of ``predict(X)``.

        Both numerator and denominator are shifted by 0.1.
        """
        prediction = self.predict(X)
        relative_error = (prediction - y + 0.1) / (y + 0.1)
        return np.sqrt(np.mean(np.square(relative_error)))

    def save(self):
        """Dump the SVR hyper-parameters to ``MODEL_DIR`` and return the path.

        Only ``get_params()`` is stored, not the fitted estimator.
        """
        file_name = self.__class__.__name__ + datetime.datetime.now().strftime(
            "%Y-%m-%d-%H:%M")
        path = os.path.join(MODEL_DIR, file_name)
        joblib.dump(self.svr.get_params(), path)
        return path

    @staticmethod
    def load_model(file_name):
        """Return a new forecaster configured from a file written by save()."""
        model = SVRForecaster()
        params = joblib.load(os.path.join(MODEL_DIR, file_name))
        # set_params takes keyword arguments; passing the dict positionally
        # raised TypeError.
        model.svr.set_params(**params)
        return model
def __init__(self, params): super(%CLASS%, self).__init__(params) tmp = SVR() params = tmp.get_params() for key in params: self.create_new_input(type_="data", label=key, widget_name="std line edit m", widget_pos="besides", pos=-1) del tmp
def grid_search(self, params, data_name):
    """Grid-search SVR hyper-parameters on ``data_name`` and print forecasts.

    The data is fetched and normalised through ``SVRCalculator``; the best
    parameters and the de-normalised next day/week/month predictions are
    printed, then ``SVRCalculator.count_errors`` is called on the search.
    """
    base_estimator = SVR(kernel='rbf', C=1.0, epsilon=0.2)
    print(base_estimator.get_params().keys())

    min_date, max_date, y, X = SVRCalculator.get_data(data_name)
    normalized = SVRCalculator.data_normalization(max_date, min_date, y, X)
    X, y, sc_y, next_day, next_week, next_month, sc_X = normalized

    search = GridSearchCV(estimator=base_estimator,
                          param_grid=params,
                          cv=5,
                          n_jobs=4,
                          verbose=0,
                          refit=True)
    search.fit(X, y.values.ravel())

    print("Best parameters for")
    print(data_name)
    print(search.best_params_)

    # Compute all three forecasts before printing any of them.
    horizons = (
        ("Next day prediction: ", next_day),
        ("Next week prediction: ", next_week),
        ("Next month prediction: ", next_month),
    )
    forecasts = [
        (label, sc_y.inverse_transform(search.predict(inputs)))
        for label, inputs in horizons
    ]
    for label, forecast in forecasts:
        print(label, forecast)

    SVRCalculator.count_errors(y, X, sc_y, search)
def svr(term='poly4'):
    """
    Method to load unfitted SVR models of type modelclass

    INPUT:
        term: 'linear', 'poly2', 'poly4' or 'rbf'

    RETURN:
        model

    RAISES:
        ValueError: if term is not one of the names above
    """
    # Kernel-specific arguments, looked up by value. The previous `is`
    # comparisons tested object identity, which only works for interned
    # string literals.
    kernel_kwargs_by_term = {
        'linear': {'kernel': 'linear'},
        'poly2': {'kernel': 'poly', 'degree': 2},
        'poly4': {'kernel': 'poly', 'degree': 4},
        'rbf': {'kernel': 'rbf'},
    }
    try:
        kernel_kwargs = kernel_kwargs_by_term[term]
    except (KeyError, TypeError):
        raise ValueError('Term unknown')

    # 'auto' is the behaviour 'auto_deprecated' stood for (1 / n_features);
    # the deprecated spelling is rejected by scikit-learn >= 0.22.
    regmod = SVR(gamma='auto', C=1.0, epsilon=0.1, **kernel_kwargs)

    utils.display_get_params('SVR Model Description', regmod.get_params())
    return regmod
def __init__(self, params): super(%CLASS%, self).__init__(params) tmp = SVR() params = tmp.get_params() for key in params: self.create_new_output(type_="data", label=key, pos=-1) del tmp self.create_new_output(type_="data", label="param dict", pos=-1)
class sumSVR(object):
    """SVR on a weighted sum of per-dimension precomputed kernels.

    Column ``i`` of the input is passed to ``kernel_functions[i]`` and the
    resulting kernel matrices are summed with the normalised weights ``w``.
    The wrapped ``SVR`` always uses ``kernel="precomputed"``.
    """

    def __init__(self, dim=None, *args, **kwargs):
        """
        Args:
            dim: number of input columns / kernels (1 when None).
            *args, **kwargs: forwarded to ``SVR`` after removing the keys
                ``w``, ``kernel_functions``, ``kernel_kwargs`` and ``x``.
        """
        self.dim = dim if dim is not None else 1
        w = kwargs.pop("w", None)
        self.kernel_functions = kwargs.pop("kernel_functions", [])
        if self.kernel_functions is not None:
            self.kernel_kwargs = kwargs.pop(
                "kernel_kwargs", [{} for _ in self.kernel_functions])
        else:
            # Still consume the key so it is not forwarded to SVR().
            kwargs.pop("kernel_kwargs", None)
            self.kernel_kwargs = []
        kwargs["kernel"] = "precomputed"
        if w is None:
            # Use the resolved dimension: np.ones(None) raises when dim
            # was not given.
            w = np.ones(self.dim)
        w = np.asarray(w, dtype=float)
        self.w = w / np.linalg.norm(w)
        self.x = kwargs.pop('x', None)
        self.SVR = SVR(*args, **kwargs)

    def fit(self, x, y):
        """Fit on the summed training kernel of ``x``; returns ``self``."""
        self.x = x
        kernel_train = np.zeros((x.shape[0], x.shape[0]))
        for i in range(self.dim):
            kernel_i = self.kernel_functions[i](x[:, i],
                                                **self.kernel_kwargs[i])
            kernel_train += self.w[i] * kernel_i
        self.SVR.fit(kernel_train, y)
        return self

    def predict(self, x):
        """Predict for ``x`` using kernels against the stored training data."""
        kernel_test = np.zeros((x.shape[0], self.x.shape[0]))
        for i in range(self.dim):
            kernel_i = self.kernel_functions[i](x[:, i], self.x[:, i],
                                                **self.kernel_kwargs[i])
            kernel_test += self.w[i] * kernel_i
        return self.SVR.predict(kernel_test)

    def get_params(self, deep=False):
        """Return the SVR parameters plus this wrapper's own settings."""
        params = self.SVR.get_params()
        params['dim'] = self.dim
        params['w'] = self.w
        params['kernel_functions'] = self.kernel_functions
        params['kernel_kwargs'] = self.kernel_kwargs
        params['x'] = self.x
        return params

    def set_params(self, **params):
        """Update the given parameters, keeping all others; returns ``self``.

        The object is re-initialised, so the wrapped SVR is replaced by a
        new, unfitted one.
        """
        # Start from the current settings so that e.g. set_params(C=10)
        # no longer resets dim, w and the kernel functions to defaults.
        merged = self.get_params()
        merged.update(params)
        self.__init__(**merged)
        return self
def test_parameters(self):
    """
    Testing parameters of Model class.

    For every supported algorithm a ``Model`` wrapper and a plain
    scikit-learn estimator are created, and each parameter name exposed by
    the wrapped estimator must also be a parameter name of the reference.
    """
    def assert_param_names_known(wrapper, reference):
        # Every parameter name of the wrapped estimator must exist on
        # the reference estimator.
        for name in wrapper.model.get_params():
            self.assertIn(name, list(reference.get_params()))

    #1.) PLS regression
    pls_parameters = {"n_components": 20, "scale": False, "max_iter": 200}
    assert_param_names_known(
        Model(algorithm="PlsRegression", parameters=pls_parameters),
        PLSRegression(n_components=20, scale="svd", max_iter=200))

    #2.) random forest
    rf_parameters = {"n_estimators": 200, "max_depth": 50,
                     "min_samples_split": 10}
    assert_param_names_known(
        Model(algorithm="RandomForest", parameters=rf_parameters),
        RandomForestRegressor(n_estimators=200, max_depth=50,
                              min_samples_split=10))

    #3.) k nearest neighbours
    knn_parameters = {"n_neighbors": 10, "weights": "distance",
                      "algorithm": "ball_tree"}
    assert_param_names_known(
        Model(algorithm="KNN", parameters=knn_parameters),
        KNeighborsRegressor(n_neighbors=10, weights='distance',
                            algorithm="kd_tree"))

    #4.) support vector regression
    svr_parameters = {"kernel": "poly", "degree": 5, "coef0": 1}
    assert_param_names_known(
        Model(algorithm="SVR", parameters=svr_parameters),
        SVR(kernel='poly', degree=5, coef0=1))

    #5.) AdaBoost
    ada_parameters = {"n_estimators": 150, "learning_rate": 1.2,
                      "loss": "square"}
    assert_param_names_known(
        Model(algorithm="AdaBoost", parameters=ada_parameters),
        AdaBoostRegressor(n_estimators=150, learning_rate=1.2,
                          loss="square"))

    #6.) bagging
    bagging_parameters = {"n_estimators": 50, "max_samples": 1.5,
                          "max_features": 2}
    assert_param_names_known(
        Model(algorithm="Bagging", parameters=bagging_parameters),
        BaggingRegressor(n_estimators=50, max_samples=1.5,
                         max_features="square"))

    #7.) lasso
    lasso_parameters = {"alpha": 1.5, "max_iter": 500, "tol": 0.004}
    assert_param_names_known(
        Model(algorithm="lasso", parameters=lasso_parameters),
        Lasso(alpha=1.5, max_iter=500, tol=0.004))
class SVM():
    '''
    Thin wrapper that delegates to SVC (task='cls') or SVR (task='prd').
    '''

    def __init__(self, task='cls', **kwargs):
        '''
        task     'cls' builds an SVC, 'prd' builds an SVR
        **kwargs forwarded to the chosen estimator
        raises   ValueError for any other task
        '''
        if task == 'cls':
            self.svm = SVC(**kwargs)
            self._name = 'SVC'
        elif task == 'prd':
            self.svm = SVR(**kwargs)
            self._name = 'SVR'
        else:
            # Previously an unknown task left self.svm undefined and the
            # failure only surfaced later as an AttributeError.
            raise ValueError(
                "task must be 'cls' or 'prd', got %r" % (task,))

    def decision_function(self, X):
        '''
        X (n_samples, n_features)
        return: X (n_samples, n_classes * (n_classes-1) / 2)
                None when the wrapped estimator is the SVR
        '''
        if self._name == 'SVC':
            return self.svm.decision_function(X)
        return None

    def fit(self, X, y, sample_weight=None):
        '''
        X (n_samples, n_features)
        y (n_samples,)
        sample_weight (n_samples,)
        '''
        return self.svm.fit(X, y, sample_weight=sample_weight)

    def get_params(self, deep=True):
        return self.svm.get_params(deep=deep)

    def predict(self, X):
        return self.svm.predict(X)

    def score(self, X, y, sample_weight=None):
        '''
        X (n_samples, n_features)
        y (n_samples,) or (n_samples, n_outputs)
        sample_weight (n_samples,), default=None
        '''
        return self.svm.score(X, y, sample_weight=sample_weight)

    def set_params(self, **params):
        '''
        **params dict
        '''
        return self.svm.set_params(**params)
class SVM():
    """RBF-kernel SVR that records sample counts and mean absolute errors."""

    # Filled in by treinar() / testar(); None until those have run.
    num_amostra_treinamento = None
    num_amostra_teste = None
    mae_treinamento = None
    mae_teste = None

    def __init__(self, gamma_, C_, epsilon_):
        self.svm = SVR(kernel="rbf", gamma=gamma_, C=C_, epsilon=epsilon_)

    def treinar(self, X, Y):
        """Fit on (X, Y) and store the training sample count and MAE."""
        self.num_amostra_treinamento = X.shape[0]
        self.svm.fit(X, Y)
        resp = self.svm.predict(X)
        self.mae_treinamento = mean_absolute_error(Y, resp)

    def testar(self, X, Y):
        """Predict on X and store the test sample count and MAE against Y."""
        self.num_amostra_teste = X.shape[0]
        resp = self.svm.predict(X)
        self.mae_teste = mean_absolute_error(Y, resp)

    def imprimir(self):
        """Print the hyper-parameters, the train/test split sizes and MAEs.

        Reads the values stored by treinar() and testar(), so both must have
        been called first.
        """
        params = self.svm.get_params()
        total = self.num_amostra_treinamento + self.num_amostra_teste

        # gamma may be a number or a string such as 'scale' / 'auto';
        # "%f" raises TypeError for the latter.
        gamma = params['gamma']
        try:
            gamma_text = "%f" % gamma
        except TypeError:
            gamma_text = "%s" % gamma

        print("SVM com núcleo RBF")
        print("C = %f" % params['C'])
        print("Gamma = %s" % gamma_text)
        print("Epsilon = %f" % params['epsilon'])
        print("Conjunto de Treinamento: %d amostras (%.2f%%)" %
              (self.num_amostra_treinamento,
               ((100.0 / total) * self.num_amostra_treinamento)))
        print("MAE Treinamento: %f" % self.mae_treinamento)
        print("Conjunto de Teste: %d amostras (%.2f%%)" %
              (self.num_amostra_teste,
               ((100.0 / total) * self.num_amostra_teste)))
        print("MAE Teste: %f" % self.mae_teste)
imax = np.argmin(mses) #fitter = AdaBoostRegressor(n_estimators=50) #fitter = gaussian_process.GaussianProcess() #fitter = LinearRegression() fitter2 = SVR(kernel='rbf',C=cs[imax]) tec_validate_fit = fitter2.fit(data_train,tec_train).predict(data_validate) print fitter.get_params(deep=True) #coefs = fitter.coef_ #print abs(coefs[0:6]).sum() #print abs(coefs[6:12]).sum() #print abs(coefs[12:18]).sum() #print coefs[-1] #MSE: mse = np.mean((tec_validate_fit-tec_validate)**2) #print "smse",np.sqrt(mse) #print fitter.coef_ #plot import matplotlib.pyplot as plt
from sklearn.svm import SVR
import numpy as np

# Script: fit a default SVR on a CSV file and report in-sample statistics.
# Column 0 of the file is the target, the remaining columns are features.
filename = '2004_2009f.csv'
puredata = np.loadtxt(filename, delimiter=',')
X = puredata[:, 1:]
Y = puredata[:, 0]

svr = SVR()
print("fitting")
svr.fit(X, Y)
print("prediction")
y_pred = svr.predict(X)

# Per-sample prediction error. Computed as one array operation; the old
# element-wise loop stored it in a variable named `list`, shadowing the
# builtin.
residuals = y_pred - Y

# Single-argument print calls give the same output on Python 2 and 3.
print("Number of tuples:  %d" % len(X))
print("Mean of predictions :  %s" % (np.mean(y_pred),))
print("Standard deviation :  %s" % (np.std(residuals, ddof=1),))
print(svr.get_params(deep=True))
class Baseline:
    """Linear SVR baseline on GIST features for one city and destination.

    Image lists and label files are read from
    ``../data/dataset/<city>/distance``. Features are either extracted in
    parallel with ``gist_wrapper`` or read from the LMDB database at
    ``../data/dataset/gist``.
    """

    def __init__(self, city, dest_name):
        """
        Args:
            city: dataset sub-directory name.
            dest_name: one of 'bofa', 'church', 'gas_station',
                'high_school', 'mcdonalds'; selects the label column.

        Raises:
            KeyError: if ``dest_name`` is not one of the names above.
        """
        self.city = city
        self.dest_name = dest_name
        print('Baseline implementation for {:s} : {:s}'.format(
            self.city, self.dest_name))
        # Column of the label file for each destination.
        # NOTE(review): 'gas_station' used to map to 3, colliding with
        # 'high_school' and leaving column 2 unused; changed to 2 on the
        # assumption that the columns follow this order. Confirm against the
        # label files.
        dest_to_idx = {
            'bofa': 0,
            'church': 1,
            'gas_station': 2,
            'high_school': 3,
            'mcdonalds': 4
        }
        self.idx = dest_to_idx[self.dest_name]
        self.base_dir = osp.join('../data/dataset', city)
        self.train_label_filename = osp.join(self.base_dir, 'distance',
                                             'train_labels.txt')
        self.train_im_list_filename = osp.join(self.base_dir, 'distance',
                                               'train_im_list.txt')
        self.test_label_filename = osp.join(self.base_dir, 'distance',
                                            'test_labels.txt')
        self.test_im_list_filename = osp.join(self.base_dir, 'distance',
                                              'test_im_list.txt')
        self.svr = SVR(kernel='linear', shrinking=False, cache_size=10000,
                       verbose=True)
        # self.svr = LinearSVR(verbose=1)

    def _prepare_split(self, im_list_filename, label_filename):
        """Read one split; return (image paths, zeroed features, labels)."""
        with open(im_list_filename, 'r') as f_im,\
                open(label_filename, 'r') as f_label:
            im_names = [
                osp.join('../data/dataset', l.rstrip().split(' ')[0])
                for l in f_im
            ]
            labels = [
                float(l.rstrip().split(' ')[self.idx]) for l in f_label
            ]
        # get dims: extract one descriptor to learn the feature length
        ge = GISTExtractor(width=256, height=256)
        im = cv2.imread(im_names[0])
        gist_features = ge.extract_gist(im)
        # np.float64 replaces the np.float alias removed from NumPy.
        X = np.zeros((len(im_names), gist_features.shape[0]),
                     dtype=np.float64)
        return im_names, X, np.asarray(labels)

    @staticmethod
    def _fill_features_parallel(im_names, X):
        """Fill the rows of X with GIST features computed by a worker pool."""
        pool = Pool(initializer=pool_init, initargs=(256, 256))
        try:
            # Integer chunk size of at least 1: `/` yields a float on
            # Python 3 and 0 for fewer than four images.
            chunksize = max(1, len(im_names) // 4)
            for idx, feat in enumerate(
                    pool.imap(gist_wrapper, im_names, chunksize)):
                X[idx, :] = feat
        finally:
            pool.close()
            pool.join()

    @staticmethod
    def _fill_features_serial(im_names, X):
        """Fill the rows of X with features stored in the LMDB database."""
        db = lmdb.open('../data/dataset/gist', map_size=int(1e12),
                       readonly=True)
        try:
            with db.begin() as txn:
                for idx, im_name in enumerate(im_names):
                    if idx % 100 == 0:
                        print('Image {:d} / {:d}'.format(idx, len(im_names)))
                    key = get_key(im_name)
                    # frombuffer replaces the deprecated binary mode of
                    # np.fromstring; the row assignment copies the data.
                    X[idx, :] = np.frombuffer(txn.get(key))
        finally:
            # The environment used to be left open.
            db.close()

    def collect_train_data_parallel(self):
        """Populate ``train_X`` / ``train_y`` using parallel extraction."""
        im_names, self.train_X, self.train_y = self._prepare_split(
            self.train_im_list_filename, self.train_label_filename)
        # parallel feature extraction!
        print('Collecting training data')
        self._fill_features_parallel(im_names, self.train_X)

    def collect_train_data_serial(self):
        """Populate ``train_X`` / ``train_y`` from the LMDB feature store."""
        im_names, self.train_X, self.train_y = self._prepare_split(
            self.train_im_list_filename, self.train_label_filename)
        # serial feature extraction!
        print('Collecting training data')
        self._fill_features_serial(im_names, self.train_X)

    def collect_test_data_parallel(self):
        """Populate ``test_X`` / ``test_y`` using parallel extraction."""
        im_names, self.test_X, self.test_y = self._prepare_split(
            self.test_im_list_filename, self.test_label_filename)
        # parallel feature extraction!
        print('Collecting testing data')
        self._fill_features_parallel(im_names, self.test_X)

    def collect_test_data_serial(self):
        """Populate ``test_X`` / ``test_y`` from the LMDB feature store."""
        im_names, self.test_X, self.test_y = self._prepare_split(
            self.test_im_list_filename, self.test_label_filename)
        # serial feature extraction!
        print('Collecting testing data')
        self._fill_features_serial(im_names, self.test_X)

    def train(self, C=1.0, calc_loss=False):
        """Fit the SVR with penalty ``C`` on the training data.

        Returns:
            The L2 norm of the test residuals when ``calc_loss`` is true,
            otherwise 0.
        """
        print('Training with C = {:f}'.format(C))
        self.svr.set_params(C=C)
        self.svr.fit(self.train_X, self.train_y)
        loss = 0
        if calc_loss:
            test_y_pred = self.svr.predict(self.test_X)
            loss = np.linalg.norm(test_y_pred - self.test_y)
            # score = self.svr.score(self.test_X, self.test_y)
            print('Loss = {:f}'.format(loss))
        return loss

    def cross_validate(self):
        """Train with C = 1e-2 ... 1e4; print and return the lowest-loss C."""
        # np.arange instead of the Python-2-only xrange.
        C = np.power(10.0, np.arange(-2, 5))
        losses = np.array([self.train(c, calc_loss=True) for c in C])
        idx = np.argmin(losses)
        print('Best C = {:f}'.format(C[idx]))
        return C[idx]

    def save_current_model(self):
        """Dump the current SVR to ``<base_dir>/distance/<dest_name>.pkl``."""
        model_filename = osp.join(self.base_dir, 'distance',
                                  '{:s}.pkl'.format(self.dest_name))
        joblib.dump(self.svr, model_filename)
        print('{:s} saved'.format(model_filename))
def learn_models(model_names, features_to_use):
    """
    This version splits original texts in dataset for evaluating summaries

    Each name in ``model_names`` ('dtr', 'linear', 'svm', 'dummy', 'ideal',
    'nb') is trained on the 'CNN' feature set, evaluated with
    ``Learn.evaluate_model`` and ``evaluate_summarizer``, and exported with
    ``utilities.export_model``. Unknown names are reported and skipped.

    Returns:
        ``(rouge_scores, model_results)`` of the last model that was
        evaluated, or ``(None, None)`` when no model was evaluated.
    """
    dataset_features = utilities.load_features('CNN')
    features, targets, labels, documents, all_vec = utilities.split_dataset(
        dataset_features, features_to_use, 0.28, 'CNN')
    #return utilities.write_dataset_csv(dataset_features, '/tmp/test.csv')

    X_normal = np.array(all_vec)
    #X_normal = utilities.select_features(features_to_use, X_normal)
    # X_normal = StandardScaler().fit_transform(dataset[0])
    utilities.normalize_dataset(X_normal, features_to_use, 'learn')

    X_train = np.array(features['train'])
    X_test = np.array(features['test'])
    y_train = np.array(targets['train'])
    y_test = np.array(targets['test'])
    labels_train = np.array(labels['train'])
    labels_test = np.array(labels['test'])
    #X_train = utilities.select_features(features_to_use, X_train)
    #utilities.normalize_dataset(X_train, features_to_use)
    #X_test = utilities.select_features(features_to_use, X_test)
    #utilities.normalize_dataset(X_test, features_to_use)
    print("Dataset size: {}".format(len(all_vec)))

    #(X_balanced, y_balanced, labels_balanced) = (X_train, y_train, labels_train)
    X_balanced, y_balanced, labels_balanced = utilities.balance_dataset(
        X_train, y_train, labels_train, 1)

    print("Used features: " + ','.join(features_to_use))
    print("Train set size: {}".format(len(X_balanced)))
    print("Number of True/False labels: {}/{}".format(
        sum(labels_balanced), sum(1 for i in labels_balanced if not i)))
    print("Test set size: {}".format(len(X_test)))
    print("Number of True/False labels: {}/{}".format(
        sum(labels_test), sum(1 for i in labels_test if not i)))
    print("Used features: {}".format(len(X_balanced[0])))

    dataset_json = json.loads(
        utilities.read_file('resources/CNN/documents.json'))
    test_documents = {int(key): dataset_json[key] for key in documents['test']}

    # Defined up front so the return cannot hit an unbound local when
    # model_names is empty or contains only unknown names.
    rouge_scores = None
    model_results = None

    for model_type in model_names:
        # Reset per model: the flag used to be set once before the loop, so
        # a 'dummy' or 'nb' entry made every later model be evaluated as a
        # non-regressor.
        is_regressor = True
        print('**********************' + model_type +
              '**********************')
        if model_type == 'dtr':
            # max_depth=6
            regr = tree.DecisionTreeRegressor(criterion='friedman_mse')
            regr = regr.fit(X_balanced, y_balanced)
            print(regr.get_params())
            export_name = 'dtr'
        elif model_type == 'linear':
            regr = linear_model.LinearRegression()
            # Train the model using the training sets
            regr.fit(X_balanced, y_balanced)
            # The coefficients
            print('Coefficients: \n', regr.coef_)
            export_name = 'linear'
        elif model_type == 'svm':
            regr = SVR(kernel='rbf', degree=7, verbose=False,
                       epsilon=0.000001, gamma='scale', tol=.0000001,
                       shrinking=True)
            # Train the model using the training sets
            regr.fit(X_balanced, y_balanced)
            # Printed under the 'Coefficients' label, but these are the
            # estimator's hyper-parameters.
            print('Coefficients: \n', regr.get_params())
            export_name = 'svm'
        elif model_type == 'dummy':
            regr = RndRegressor()
            export_name = 'dummy'
            is_regressor = False
        elif model_type == 'ideal':
            from IdealRegressor import IdealRegressor
            regr = IdealRegressor(X_train, y_train)
            # The ideal model is fitted on the test split itself.
            regr.fit(X_test, y_test)
            export_name = 'ideal'
        elif model_type == 'nb':
            from sklearn.naive_bayes import ComplementNB
            regr = ComplementNB(alpha=0.015)
            # Trained on the unbalanced split with the boolean labels.
            regr.fit(X_train, labels_train)
            is_regressor = False
            export_name = 'nb'
        else:
            print("Regression type is undefined:" + model_type)
            continue

        # Make predictions using the testing set
        model_results = Learn.evaluate_model(regr, X_test, X_balanced,
                                             y_test, y_balanced, labels_test,
                                             labels_balanced, is_regressor)
        print('Summarizing dataset and evaluating Rouge...')
        rouge_scores = evaluate_summarizer(regr, test_documents,
                                           features_to_use, True)
        utilities.print_rouges(rouge_scores)
        utilities.export_model(regr, export_name)
        print(
            '*****************************************************************************'
        )
    return rouge_scores, model_results
svm_reg.fit(fires_prepared, fires_labels) #dt from sklearn.tree import DecisionTreeRegressor tree_reg = DecisionTreeRegressor(max_depth=2, random_state=42) tree_reg.fit(fires_prepared, fires_labels) #rf from sklearn.ensemble import RandomForestRegressor forest_reg = RandomForestRegressor(random_state=42) forest_reg.fit(fires_prepared, fires_labels) #2-1 from sklearn.model_selection import GridSearchCV print("sgd_reg.get_params().keys(): ", sgd_reg.get_params().keys()) print("svm_reg.get_params().keys(): ", svm_reg.get_params().keys()) print("tree_reg.get_params().keys(): ", tree_reg.get_params().keys()) print("forest_reg.get_params().keys(): ", forest_reg.get_params().keys()) params_sgd = [ { 'alpha': [0.1, 0.5], 'epsilon': [0.1, 1] }, { 'alpha': [0.5, 0.6], 'epsilon': [0.1, 0.7] }, ] params_svm = {
# Mean squared error of the current predictions, then the estimator's score.
print (np.sum((y_pred - y)** 2)/len(X))
print clf.score(X,y)
# print clf.best_estimator_
# print clf.best_score_
# print clf.best_params_
# print clf.cv_results_
# Support vector indices and the support vectors themselves.
print clf.support_
print clf.support_vectors_
# print clf.coef_
# y_pred = clf.predict(X)
# print y
# print y_pred
# print (np.sum((y_pred - y)** 2)/len(X))
# print clf.score(X,y)
print clf.get_params()
# joblib.dump(clf, "rbf_SVR_100k_4_111.pkl")

# Evaluate on the test file: column 0 is the target, columns 1..90 the features.
data, nrows, ncols = readDataSet("YearPredictionMSDTest10.txt")
X = data[:,1:91]
y = data[:,0]
# NOTE(review): a new PCA and a new StandardScaler are fitted on the test
# data here. If the training data went through its own fitted PCA/scaler
# (not visible in this fragment), those fitted objects should presumably be
# reused with transform() so both sets share one projection. Confirm.
clfp = PCA(n_components = 4)
X = clfp.fit_transform(X)
X = StandardScaler().fit_transform(X)
y_pred = clf.predict(X)
print y_pred, y
# Mean squared error and score on the test set.
print (np.sum((y_pred - y)** 2)/len(X))
print clf.score(X,y)
class Trainer():
    """Loads the loan data set and trains/evaluates regressors and classifiers.

    Regression models live in ``self.regr`` and classifiers in ``self.clf``;
    most methods take ``kind`` ("regression" or "classification") to choose
    between the two. Predictions for the held-out loans are written back to
    the ``test_loans`` MySQL table by ``test``.
    """

    def __init__(self):
        """Connect to MySQL with the password from credentials.json and load data."""
        with open('credentials.json') as credentials_file:
            credentials = json.load(credentials_file)
        passwd = credentials['mysql']['password']
        self.con = mdb.connect(host='127.0.0.1', port=3306, user='******',
                               passwd=passwd, db='insight', autocommit=True)
        self.cur = self.con.cursor()
        # Single-argument print calls give the same output on Python 2 and 3.
        print("Connected to database")
        self.load_data()

    def load_data(self):
        """Load the pickled dump into ``self.loanData`` as a DataFrame."""
        # NOTE: pickle.load executes arbitrary code from the file; only use
        # dumps produced by this project.
        with open('./pickles/mysql_dump.pickle', 'rb') as f:
            self.loanData = pd.DataFrame(pickle.load(f))

    def drop_na(self):
        """Drop rows containing missing values and renumber the index."""
        # Was `loanData.dropna()`: an undefined name, raising NameError.
        self.loanData = self.loanData.dropna()
        self.loanData.index = range(len(self.loanData))

    def drop_columns(self):
        """Drop the columns with malformed data in the MySQL database."""
        self.loanData = self.loanData.drop(
            ['none', 'educational', 'IA', 'IDAHO', 'ME', 'NE',
             'other_housing', 'issue_year'], axis=1)

    def drop_prepaid_loans(self):
        """Drop loans with loan_status == 1 and days_to_zero_dollars < 1000."""
        prepaid = ((self.loanData['loan_status'] == 1) &
                   (self.loanData['days_to_zero_dollars'] < 1000))
        self.loanData = self.loanData[~prepaid]
        print("Number of prepaid loans:  %d" % prepaid.sum())
        print("Number of loans after dropping prepaids:  %d"
              % len(self.loanData))

    def define_features_targets(self, kind="regression"):
        """Split off the test loans and build the feature/target arrays.

        Test loans are the ids listed in the ``test_loans`` table. The
        target is 'days_to_zero_dollars' for regression and 'loan_status'
        for classification.
        """
        # take out 1000 random loans with 36 month terms for testing
        # ids are already populated in test_loans for consistency
        self.cur.execute("select id from test_loans;")
        sql_resp = self.cur.fetchall()
        print("length of sql response:  %d" % len(sql_resp))
        test_ids = [row[0] for row in sql_resp]
        print("length of test_ids:  %d" % len(test_ids))

        # make the test and train data frames
        is_test = self.loanData['id'].isin(test_ids)
        self.testLoanData = self.loanData[is_test].reset_index(drop=True)
        self.trainLoanData = self.loanData[~is_test].reset_index(drop=True)
        print("Train Loan Data:  %d" % len(self.trainLoanData))
        print("Test Loan Data:  %d" % len(self.testLoanData))

        self.features = self.trainLoanData.drop(
            ['loan_status', 'days_to_zero_dollars', 'id'], axis=1).values

        # choose different target variables for regression vs classification
        if kind == "regression":
            self.targets = self.trainLoanData['days_to_zero_dollars'].values
            self.y_test = self.testLoanData['days_to_zero_dollars'].values
        elif kind == "classification":
            self.targets = self.trainLoanData['loan_status'].values
            self.y_test = self.testLoanData['loan_status'].values

    def preprocess(self):
        """Split train/cv and standardise and rescale all feature matrices."""
        (self.X_train, self.X_cv, self.y_train,
         self.y_cv) = dm.split_train_test(features=self.features,
                                          targets=self.targets,
                                          test_size=0.1)
        self.X_test = self.testLoanData.drop(
            ['loan_status', 'days_to_zero_dollars', 'id'], axis=1).values
        (self.X_train, self.X_cv) = dm.standardize_samples(self.X_train,
                                                           self.X_cv)
        (self.X_train, self.X_cv) = dm.scale_samples_to_range(self.X_train,
                                                              self.X_cv)
        # NOTE(review): the test set is passed as both arguments here rather
        # than alongside the training set; confirm this is intended.
        (self.X_test, _) = dm.standardize_samples(self.X_test, self.X_test)
        (self.X_test, _) = dm.scale_samples_to_range(self.X_test,
                                                     self.X_test)

    def define_dummy_classifier(self):
        self.clf = DummyClassifier()

    def define_rfr(self, n_estimators=10):
        self.regr = RandomForestRegressor(n_estimators=n_estimators,
                                          oob_score=True)
        print(self.regr.get_params())

    def define_linear_regressor(self):
        self.regr = LinearRegression()
        print(self.regr.get_params())

    def define_SVR(self, C=1, gamma=0.1):
        self.regr = SVR(C=C, gamma=gamma, verbose=3)
        print(self.regr.get_params())

    def define_logistic_regressor(self, penalty="l2", C=1.0,
                                  class_weight=None):
        self.clf = LogisticRegression(penalty=penalty, C=C,
                                      class_weight=class_weight)
        print(self.clf.get_params())

    def define_rfc(self, n_estimators=10):
        self.clf = RandomForestClassifier(n_estimators=n_estimators,
                                          oob_score=True)
        print(self.clf.get_params())

    def train(self, kind="regression"):
        """Fit the regressor or classifier on the training split."""
        print("Fitting training data")
        if kind == "regression":
            self.regr.fit(self.X_train, self.y_train)
        elif kind == "classification":
            self.clf.fit(self.X_train, self.y_train)

    def predict(self, X, kind="regression"):
        """Store the model's prediction for ``X`` in ``self.prediction``."""
        if kind == "regression":
            self.prediction = self.regr.predict(X)
        elif kind == "classification":
            self.prediction = self.clf.predict(X)

    def score(self, X, y, kind="regression"):
        """Print the model score on (X, y).

        For classification, ``self.prediction`` must already hold the
        prediction for ``X``; precision and accuracy are stored on ``self``.
        """
        if kind == "regression":
            score_val = self.regr.score(X, y)
            print("R2 Score:  %s" % (score_val,))
        elif kind == "classification":
            score_val = self.clf.score(X, y)
            print("Accuracy:  %s" % (score_val,))
            print(classification_report(y, self.prediction))
            self.precision = precision_score(y, self.prediction,
                                             labels=[0, 1, 2], average=None)
            print("\n\nPrecision Score:  %s \n\n" % (self.precision,))
            self.accuracy = accuracy_score(y, self.prediction)

    def test(self, kind="regression"):
        """Write predictions for the test loans to ``test_loans``.

        The database connection is closed afterwards.
        """
        # run clf and regr on the test data to determine to top 100 loans
        # the top loans are the ones least likely to default
        # Values are passed as driver parameters rather than formatted into
        # the SQL text, so quoting and escaping is left to the driver.
        if kind == "regression":
            pred = self.regr.predict(self.X_test)
            print("length of regression pred:  %d" % len(pred))
            sql_query = ("UPDATE test_loans SET pred_days_to_zero_dollars=%s "
                         "where id=%s;")
            for i, loan_id in enumerate(self.testLoanData['id']):
                self.cur.execute(sql_query, (float(pred[i]), str(loan_id)))
                print(i)
        elif kind == "classification":
            pred_proba = self.clf.predict_proba(self.X_test)
            sql_query = ("UPDATE test_loans SET pred_default=%s, "
                         "pred_paid=%s, pred_prepaid=%s where id=%s;")
            for i, loan_id in enumerate(self.testLoanData['id']):
                self.cur.execute(sql_query,
                                 (float(pred_proba[i][0]),
                                  float(pred_proba[i][1]),
                                  float(pred_proba[i][2]),
                                  str(loan_id)))
        self.con.close()

    def run_pca(self, n_components=20):
        """Fit PCA on the training data and project train, cv and test sets."""
        self.pca = PCA(n_components=n_components)
        self.X_train = self.pca.fit_transform(self.X_train)
        print("Reduced data down to  %s  dimensions: "
              % (self.pca.n_components_,))
        print("Transforming cv data ...")
        self.X_cv = self.pca.transform(self.X_cv)
        print("Transforming test data ...")
        self.X_test = self.pca.transform(self.X_test)

    def plot_prediction(self):
        """Scatter ``self.prediction`` against the cross-validation targets."""
        plt.scatter(self.prediction, self.y_cv)
        plt.xlabel('prediction')
        plt.ylabel('y_test')
        plt.show()

    def runSVRGridSearch(self):
        """Train an SVR for every (C, gamma) pair; print train and cv scores."""
        C_vals = [0.01, 0.1, 1, 10, 100]
        gamma_vals = [1E-2, 1E-1, 1, 1E1, 1E2, 1E3, 1E4]
        for C in C_vals:
            for gamma in gamma_vals:
                print("\n\n C:  %s  gamma:  %s" % (C, gamma))
                self.define_SVR(C=C, gamma=gamma)
                self.train()
                print("Training Scores:")
                self.predict(self.X_train)
                self.score(self.X_train, self.y_train)
                print("Testing Scores:")
                self.predict(self.X_cv)
                self.score(self.X_cv, self.y_cv)

    def roc(self):
        '''Compute ROC curve using one-vs-all technique'''
        pred_proba = self.clf.predict_proba(self.X_cv)
        fpr = []
        tpr = []
        thresholds = []
        for i in [0, 1, 2]:
            fpr_i, tpr_i, thresholds_i = roc_curve(self.y_cv,
                                                   pred_proba[:, i],
                                                   pos_label=i)
            fpr.append(fpr_i)
            tpr.append(tpr_i)
            thresholds.append(thresholds_i)
            print("AUC:  %s" % (auc(fpr_i, tpr_i),))
        # Only the curve for class 0 is drawn, next to the chance diagonal.
        plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))
        plt.plot(fpr[0], tpr[0], label="Default", linewidth=3)
        plt.xlim([-0.05, 1.05])
        plt.ylim([-0.05, 1.05])
        plt.show()

    def pickle_algo(self, X, fileName):
        """Pickle the object ``X`` to the file ``fileName``."""
        print("pickling algorithm")
        with open(fileName, 'wb') as f:
            pickle.dump(X, f)
def run_experiments_without_cross_validation(model_names, features_to_use):
    """Fit and evaluate each requested model on a single train/test split.

    The 'pasokh' features are loaded and split through ``utilities``
    (``0.40`` is passed as the third argument of ``utilities.split_dataset``).
    Every recognised entry of ``model_names`` is then fitted, scored with
    ``evaluate_model`` and summarised with ``evaluate_summarizer``.

    Args:
        model_names: iterable of model keys. Recognised keys are 'dtr',
            'linear', 'svm', 'dummy', 'ideal' and 'nb'; any other key is
            reported and skipped.
        features_to_use: feature selection forwarded to the ``utilities``
            helpers and to ``evaluate_summarizer``.

    Returns:
        ``(rouge_scores, model_results)`` of the last model that was
        evaluated, or ``(None, None)`` when no recognised model was given.
    """
    dataset_features = utilities.load_features('pasokh')
    features, targets, labels, documents, all_vec = utilities.split_dataset(
        dataset_features, features_to_use, 0.40)

    X_normal = np.array(all_vec)
    # NOTE(review): presumably 'learn' makes the helper fit its normalisation
    # statistics on the whole dataset -- confirm against utilities.
    utilities.normalize_dataset(X_normal, features_to_use, 'learn')

    X_train = np.array(features['train'])
    X_test = np.array(features['test'])
    y_train = np.array(targets['train'])
    y_test = np.array(targets['test'])
    labels_train = np.array(labels['train'])
    labels_test = np.array(labels['test'])

    print("Dataset size: {}".format(len(X_normal)))
    #print("Number of True/False labels: {}/{}".format(sum(labels), sum(1 for i in labels if not i)))

    # Balancing is currently disabled: the "balanced" set is the raw train set.
    (X_balanced, y_balanced, labels_balanced) = (X_train, y_train, labels_train)
    #X_balanced, y_balanced, labels_balanced = utilities.balance_dataset(X_train, y_train, labels_train, 3)

    print("Train set size: {}".format(len(X_balanced)))
    print("Number of True/False labels: {}/{}".format(sum(labels_balanced), sum(1 for i in labels_balanced if not i)))
    print("Test set size: {}".format(len(X_test)))
    print("Number of True/False labels: {}/{}".format(sum(labels_test), sum(1 for i in labels_test if not i)))
    print("Used features: {}".format(len(X_balanced[0])))

    dataset_json = json.loads(utilities.read_file('resources/pasokh/all.json'))

    # Defaults so the final return is well defined even if nothing was run.
    rouge_scores, model_results = None, None

    for model_type in model_names:
        print('**********************' + model_type + '**********************')
        # Reset on every iteration: only the 'nb' branch is a classifier, and
        # its flag must not leak into models evaluated after it.
        is_regressor = True
        if model_type == 'dtr':
            # max_depth=6
            regr = tree.DecisionTreeRegressor()
            regr = regr.fit(X_balanced, y_balanced)
        elif model_type == 'linear':
            regr = linear_model.LinearRegression(normalize=True)
            # Train the model using the training sets
            regr.fit(X_balanced, y_balanced)
            # The coefficients
            print('Coefficients: \n', regr.coef_)
        elif model_type == 'svm':
            regr = SVR(verbose=True, epsilon=0.00001, gamma='auto', tol=.00001)
            # Train the model using the training sets
            regr.fit(X_balanced, y_balanced)
            # SVR exposes no coefficients here; its parameters are printed instead
            print('Coefficients: \n', regr.get_params())
        elif model_type == 'dummy':
            regr = RndRegressor()
        elif model_type == 'ideal':
            from IdealRegressor import IdealRegressor
            regr = IdealRegressor(X_train, y_train)
            regr.fit(X_test, y_test)
        elif model_type == 'nb':
            # from sklearn import svm
            # regr = svm.SVC(gamma='scale').fit(X_train, labels_train)
            from sklearn.naive_bayes import ComplementNB
            regr = ComplementNB(alpha=1)
            # The classifier is trained on the boolean labels, not the targets
            regr.fit(X_train, labels_train)
            is_regressor = False
        else:
            print("Regression type is undefined:" + model_type)
            continue

        # Make predictions using the testing set
        model_results = evaluate_model(regr, X_test, X_balanced, y_test, y_balanced,
                                       labels_test, labels_balanced, is_regressor)
        print('Summarizing dataset and evaluating Rouge...')
        rouge_scores = evaluate_summarizer(regr, dataset_json, features_to_use, True)
        utilities.print_rouges(rouge_scores)
        print('*****************************************************************************')

    return rouge_scores, model_results
# Fit three support-vector estimators (SVC, SVR, LinearSVC) on the same
# training data and print each one's .score() on the train and test sets.
# NOTE(review): the messages say "precision", but the printed value is whatever
# .score() returns. If these are the scikit-learn estimators, that is mean
# accuracy for SVC/LinearSVC and R^2 for SVR -- confirm and consider relabelling.
print ''
# NOTE(review): gamma is passed together with kernel='linear'; presumably it has
# no effect for a linear kernel -- confirm.
svc = SVC(gamma=0.001, kernel='linear')
print 'SVC config:'
print svc.get_params()
svc.fit(smr_train.feature_matrix, smr_train.labels)
svc_score_train = svc.score(smr_train.feature_matrix, smr_train.labels)
print 'SVC precision train: {}'.format(svc_score_train)
svc_score_test = svc.score(smr_test.feature_matrix, smr_test.labels)
print 'SVC precision test: {}'.format(svc_score_test)
# plot_learning_curve(svc, 'SVC Curve', smr_train.feature_matrix, smr_train.labels, n_jobs=4)
print ''
# Regression variant with default parameters, fitted on the same labels.
svr = SVR()
print 'SVR config:'
print svr.get_params()
svr.fit(smr_train.feature_matrix, smr_train.labels)
svr_score_train = svr.score(smr_train.feature_matrix, smr_train.labels)
print 'SVR precision train: {}'.format(svr_score_train)
svr_score_test = svr.score(smr_test.feature_matrix, smr_test.labels)
print 'SVR precision test: {}'.format(svr_score_test)
# plot_learning_curve(svr, 'SVR Curve', smr_train.feature_matrix, smr_train.labels, n_jobs=4)
print ''
# Linear classifier with default parameters.
lsvc = LinearSVC()
print 'LinearSVC config:'
print lsvc.get_params()
lsvc.fit(smr_train.feature_matrix, smr_train.labels)
lsvc_score_train = lsvc.score(smr_train.feature_matrix, smr_train.labels)
print 'LinearSVC precision train: {}'.format(lsvc_score_train)
lsvc_score_test = lsvc.score(smr_test.feature_matrix, smr_test.labels)
#result_X = preprocessing.scale(result_X) X_feats = result_X[:, 2:] X_target = result_X[:, 1] #pca = PCA(100) #X_feats = pca.fit_transform(X_feats) ############clf = sv reg############### clf = SVR(C = 1.0, epsilon = 0.2) clf.fit(X_feats, X_target) X_test = X_feats Y_test = X_target ############ predicted_x = clf.predict(X_test) a = normalized_gini(predicted_x, Y_test) print a#("GINI Score : ", a) P = clf.get_params() #np.savetxt('svr1.txt', clf.coef_) #result_X = np.column_stack(result + [[1]*len(result[0])]) #beta_hat = np.linalg.lstsq(result[1:[1, 2, 3]], result[1:,[0]].T)[0] #print clf.coef_ with open('svr1.csv', 'wb') as csvfile: swriter = csv.writer(csvfile, delimiter=',') swriter.writerow([x for x in P])
def standard_experiment(train_df, test_df, feature_names, args): train_df['set'] = "train" # annotate test_df['set'] = "test" # annotate # clip training set, if necessary if (0 < args.limit_data < len(train_df)): print "Clipping training set to %d comments" % args.limit_data train_df = train_df[:args.limit_data] # Split into X, y for regression target = args.target train_X = train_df.filter(feature_names).as_matrix().astype( np.float) # training data train_y = train_df.filter([target]).as_matrix().astype( np.float) # training labels test_X = test_df.filter(feature_names).as_matrix().astype( np.float) # test data test_y = test_df.filter([target ]).as_matrix().astype(np.float) # ground truth # For compatibility, make 1D train_y = train_y.reshape((-1, )) test_y = test_y.reshape((-1, )) print "Training set: %d examples" % (train_X.shape[0], ) print "Test set: %d examples" % (test_X.shape[0], ) print "Selected %d features" % (len(feature_names), ) print 'Features: %s' % (' '.join(feature_names)) ## # Preprocessing: scale data, keep SVM happy scaler = preprocessing.StandardScaler() train_X = scaler.fit_transform( train_X) # faster than fit, transform separately test_X = scaler.transform(test_X) if args.classifier != 'baseline': if args.stock_params: if args.classifier == 'svr': print "Initializing SVR model" clf = SVR(**STANDARD_PARAMS['svr']) elif args.classifier == 'rf': print "Initializing RandomForestRegressor model, seed=%d" % args.rfseed clf = RandomForestRegressor(random_state=args.rfseed, **STANDARD_PARAMS['rf']) elif args.classifier == 'elasticnet': print "Initializing ElasticNet model" clf = ElasticNet(max_iter=10000, **STANDARD_PARAMS['elasticnet']) else: raise ValueError("Invalid classifier '%s' specified." 
% args.classifier) else: ## # Run Grid Search / 10xv on training/dev set start = time.time() print "== Finding optimal classifier using Grid Search ==" params, clf = train_optimal_classifier(train_X, train_y, classifier=args.classifier, rfseed=args.rfseed, quickmode=args.quickmode) print "Optimal parameters: " + json.dumps(params, indent=4) if hasattr(clf, "support_vectors_"): print 'Number of support vectors: %d' % len( clf.support_vectors_) print "Took %.2f minutes to train" % ((time.time() - start) / 60.0) if hasattr(clf, 'random_state'): clf.set_params(random_state=args.rfseed) clf.fit(train_X, train_y) params = clf.get_params() ## # Set up evaluation function if args.ndcg_weight == 'target': favfunc = evaluation.fav_target # score weighting else: favfunc = evaluation.fav_linear # rank weighting max_K = 20 eval_func = lambda data: evaluation.ndcg(data, max_K, target=args.ndcg_target, result_label=result_label, fav_func=favfunc) ## # Predict scores for training set result_label = "pred_%s" % args.target # e.g. 
pred_score if args.classifier != 'baseline': train_pred = clf.predict(train_X) else: # baseline: post order train_pred = -1 * train_df['position_rank'] train_df[result_label] = train_pred print 'Performance on training data (NDCG with %s weighting)' % args.ndcg_weight # ndcg_train = eval_func(train_df) ndcg_train = eval_func( train_df[train_df.parent_nchildren >= args.min_posts_ndcg]) for i, score in enumerate(ndcg_train, start=1): print '\tNDCG@%d: %.5f' % (i, score) print 'Karma MSE: %.5f' % mean_squared_error(train_y, train_pred) ## # Predict scores for test set if args.classifier != 'baseline': test_pred = clf.predict(test_X) else: # baseline: post order test_pred = -1 * test_df['position_rank'] test_df[result_label] = test_pred print 'Performance on test data (NDCG with %s weighting)' % args.ndcg_weight # ndcg_test = eval_func(test_df) ndcg_test = eval_func( test_df[test_df.parent_nchildren >= args.min_posts_ndcg]) for i, score in enumerate(ndcg_test, start=1): print '\tNDCG@%d: %.5f' % (i, score) print 'Karma MSE: %.5f' % mean_squared_error(test_y, test_pred) ## # Save model to disk if args.savename and (args.classifier != 'baseline'): import cPickle as pickle saveas = args.savename + ".model.pkl" print "== Saving model as %s ==" % saveas with open(saveas, 'w') as f: pickle.dump(clf, f) ## # Get feature importance, if possible if args.savename and (args.classifier != 'baseline'): feature_importances = get_feature_importance( clf, args.classifier, feature_names=feature_names, sorted=True) saveas = args.savename + ".topfeatures.txt" print "== Recording top features to %s ==" % saveas # np.savetxt(saveas, feature_importances) # with open(saveas, 'w') as f: # json.dump(feature_importances, f, indent=2) with open(saveas, 'w') as f: maxlen = max([len(fname) for fname in feature_importances[0]]) f.write("# Model: %s\n" % args.classifier) f.write("# Params: %s\n" % json.dumps(params)) for fname, val in zip(*feature_importances): f.write("%s %.06f\n" % 
(fname.ljust(maxlen), val)) f.flush() ## # Save data to HDF5 if args.savename: # Save score predictions fields = [ "self_id", "parent_id", 'cid', 'sid', 'set', args.target, result_label ] if not args.ndcg_target in fields: fields.append(args.ndcg_target) saveas = args.savename + ".scores.h5" print "== Saving raw predictions as %s ==" % saveas outdf = pd.concat([train_df[fields], test_df[fields]], ignore_index=True) outdf.to_hdf(saveas, 'data') if args.savefull: # Concatenate train, test df = pd.concat([train_df, test_df], ignore_index=True) print "== Exporting data to HDF5 ==" saveas = args.savename + ".data.h5" df.to_hdf(saveas, "data") print " [saved as %s]" % saveas # Save NDCG calculations dd = { 'k': range(1, max_K + 1), 'method': [args.ndcg_weight] * max_K, 'ndcg_train': ndcg_train, 'ndcg_test': ndcg_test } resdf = pd.DataFrame(dd) saveas = args.savename + ".results.csv" print "== Saving results to %s ==" % saveas resdf.to_csv(saveas)
else: noOfInstances += 1 values = line.split('\n')[0] values = values.split(',') tempValues.extend(values[:noOfAttrs - 1]) outputs.append(values[noOfAttrs - 1]) for i in range(noOfInstances): for j in range(noOfAttrs - 1): row_ind.append(i) col_ind.append(j) tempValues = list(map(float, tempValues)) dataset = sparse.coo_matrix((tempValues, (row_ind, col_ind))).toarray() outputs = list(map(float, outputs)) return dataset, outputs trainingDataset, trainingoutputs = getDataset('FileName_training.csv') svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1) print(svr_rbf.get_params()) print('Training Started....') train_rbf = svr_rbf.fit(trainingDataset, trainingoutputs) joblib.dump(train_rbf, 'Model.pkl') '''svr_poly = SVR(kernel='poly', C=1e3, degree=2) print(svr_poly.get_params()) print('Training Started....') train_poly = svr_poly.fit(trainingDataset,trainingoutputs) joblib.dump(train_poly,'Model.pkl')''' print('Model Saved....')
test_set.append([predictiveAttributeDegree[i][11], predictiveAttributeDegree[i][13]]) test_result.append([predictiveAttributeDegree[i][2]]) train_percent = (len(predictiveAttributeNotDegree)/100)*80 count = 0 for i in range(len(predictiveAttributeNotDegree)): if count < train_percent: count = count + 1 train_set.append([predictiveAttributeNotDegree[i][11], predictiveAttributeNotDegree[i][13]]) train_result.append([predictiveAttributeNotDegree[i][2]]) else: test_set.append([predictiveAttributeNotDegree[i][11], predictiveAttributeNotDegree[i][13]]) test_result.append([predictiveAttributeNotDegree[i][2]]) svm_reg = SVR(kernel="poly", degree=2, C=50, epsilon=1.0, gamma="auto") train_result = np.array(train_result) svm_reg.fit(train_set, train_result.ravel()) print(svm_reg.score(test_set, test_result)) prediction = [] for item in test_set: items = [[item[0], item[1]]] prediction.append(svm_reg.predict(items)) pred = np.zeros(len(prediction)) predi = np.array(prediction) for i in range(len(prediction)): pred[i] = predi[i][0] print(("MSE: {}".format(mean_squared_error(pred, test_result)))) print("Params: ", svm_reg.get_params())
def run_experiments(model_names):
    """
    This version splits original texts in dataset for evaluating summaries.

    Loads the pre-split feature set from 'features.json', restricts it to
    ``valid_features``, normalises it, then fits and evaluates every
    recognised entry of ``model_names``.

    Args:
        model_names: iterable of model keys. Recognised keys are 'dtr',
            'linear', 'svm', 'dummy', 'ideal' and 'nb'; any other key is
            reported and skipped.

    Returns:
        ``(rouge_scores, model_results)`` of the last model that was
        evaluated, or ``(None, None)`` when no recognised model was given.
    """
    # NOTE(review): the line breaks of this literal were reconstructed; confirm
    # which entries are active and which are commented out.
    valid_features = ['cue_words', 'tfisf', 'cosine_position', 'relative_len',
                      # 'tf',
                      #'pos_ve_ratio', 'pos_aj_ratio', 'pos_nn_ratio', 'pos_av_ratio', 'pos_num_ratio', 'len', 'position'
                      'doc_words', 'doc_sens',# 'doc_parag',
                      # 'category',
                      #'doc_verbs', 'doc_adjcs', 'doc_advbs', 'doc_nouns',
                      'nnf_isnnf', 'vef_isvef', 'ajf_isajf', 'avf_isavf', 'nuf_isnuf',
                      'political', 'social', 'sport', 'culture', 'economy', 'science'
                      ]

    features, targets, labels, documents, all_vec = load_dataset_splitted('features.json', learning_features)

    X_normal = np.array(all_vec)
    X_normal = select_features(valid_features, X_normal)
    # X_normal = StandardScaler().fit_transform(dataset[0])
    # NOTE(review): presumably 'learn' makes the helper fit its normalisation
    # statistics, which the calls below then reuse -- confirm.
    normalize_dataset(X_normal, valid_features, 'learn')

    X_train = np.array(features['train'])
    X_test = np.array(features['test'])
    y_train = np.array(targets['train'])
    y_test = np.array(targets['test'])
    labels_train = np.array(labels['train'])
    labels_test = np.array(labels['test'])

    X_train = select_features(valid_features, X_train)
    normalize_dataset(X_train, valid_features)
    X_test = select_features(valid_features, X_test)
    normalize_dataset(X_test, valid_features)

    print("Dataset size: {}".format(len(all_vec)))

    # Balancing is currently disabled: the "balanced" set is the raw train set.
    (X_balanced, y_balanced, labels_balanced) = (X_train, y_train, labels_train)
    #X_balanced, y_balanced, labels_balanced = balance_dataset(X_train, y_train, labels_train, 3)

    print("Train set size: {}".format(len(X_balanced)))
    print("Number of True/False labels: {}/{}".format(sum(labels_balanced), sum(1 for i in labels_balanced if not i)))
    print("Test set size: {}".format(len(X_test)))
    print("Number of True/False labels: {}/{}".format(sum(labels_test), sum(1 for i in labels_test if not i)))
    print("Used features: {}".format(len(X_balanced[0])))

    dataset_json = json.loads(read_file('resources/pasokh/all.json'))
    # Summaries are evaluated on the documents of both splits.
    test_documents = {key: dataset_json[key] for key in documents['test']+documents['train']}

    # Defaults so the final return is well defined even if nothing was run.
    rouge_scores, model_results = None, None

    for model_type in model_names:
        print('**********************' + model_type + '**********************')
        # Reset on every iteration: only the 'nb' branch is a classifier, and
        # its flag must not leak into models evaluated after it.
        is_regressor = True
        if model_type == 'dtr':
            # max_depth=6
            regr = tree.DecisionTreeRegressor(max_depth=6)
            regr = regr.fit(X_balanced, y_balanced)
        elif model_type == 'linear':
            regr = linear_model.LinearRegression()
            # Train the model using the training sets
            regr.fit(X_balanced, y_balanced)
            # The coefficients
            print('Coefficients: \n', regr.coef_)
        elif model_type == 'svm':
            regr = SVR(verbose=True, epsilon=0.001, gamma='auto', tol=.00001)
            # Train the model using the training sets
            regr.fit(X_balanced, y_balanced)
            # SVR exposes no coefficients here; its parameters are printed instead
            print('Coefficients: \n', regr.get_params())
        elif model_type == 'dummy':
            regr = RndRegressor()
        elif model_type == 'ideal':
            from IdealRegressor import IdealRegressor
            regr = IdealRegressor(X_normal, targets)
        elif model_type == 'nb':
            # from sklearn import svm
            # regr = svm.SVC(gamma='scale').fit(X_train, labels_train)
            from sklearn.naive_bayes import ComplementNB
            regr = ComplementNB(alpha=0.015)
            # The classifier is trained on the boolean labels, not the targets
            regr.fit(X_train, labels_train)
            is_regressor = False
        else:
            print("Regression type is undefined:" + model_type)
            continue

        # Make predictions using the testing set
        model_results = evaluate_model(regr, X_test, X_balanced, y_test, y_balanced,
                                       labels_test, labels_balanced, is_regressor)
        print('Summarizing dataset and evaluating Rouge...')
        rouge_scores = evaluate_summarizer(regr, test_documents, valid_features)
        utilities.print_rouges(rouge_scores)
        print('*****************************************************************************')

    return rouge_scores, model_results
def main(): #picklef = open(config_file, 'r') #config_dict = pickle.load(picklef) print "\n=========================" print "SURROGATE MODEL GENERATOR" print "=========================" print "PARSE AND CLEAN DATA" print "=========================" # load design and target data into a pandas dataframe from the input csv dataframe = pd.read_csv(input_data_file) # drop rows (samples) with NaNs in them dataframe = dataframe[dataframe.isnull() == False] # split the dataframe into design and target dataframes design_data = dataframe[features] design_labels = design_data.axes target_data = dataframe[targets] target_labels = target_data.axes if DEBUG: print "\nFeatures:\n", design_data print "\nTargets:\n", target_data print "\nParsed data shapes\n design data: ", np.shape( design_data), "\n target data: ", np.shape(target_data) print " #samples: %d\n #input parameters: %d" % (np.shape(design_data)[0], np.shape(design_data)[1]) print " #output parameters: %d" % np.shape(target_data)[1] if DEBUG: print "design data:" print design_data print "target_data:" print target_data if test_split > 0.0: print "\n=========================" print "SPLIT TRAIN AND TEST DATASETS" print "=========================" # split the data into a training set and a testing set for validation later. 
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split( design_data, target_data, test_size=test_split) print "\nX_train, Y_train:", np.shape(X_train), np.shape(Y_train) print "X_test, Y_test:", np.shape(X_test), np.shape(Y_test) print "training sample size: %d" % np.shape(X_train)[0] print "testing sample size: %d" % np.shape(X_test)[0] if DEBUG: print "X_train:\n", X_train print "Y_train:\n", Y_train else: X_train = design_data Y_train = target_data X_test, Y_test = [], [] # standardize the training data to mean 0 and variance 1 if normalize is True: print "\n=========================" print "DATA NORMALIZATION AND SCALING" print "=========================" # initialize a StandardScaler object to calculate the means and scaling values of each design # parameter (that is, it calculates the means and stdevs over the columns). # We then use the scaler object to transform the entire input data set (except for the design ID # number) to their normalized values. X_train_scaler = preprocessing.MinMaxScaler( feature_range=(0, 1)).fit(X_train) X_train_scaled = pd.DataFrame(X_train_scaler.transform(X_train), columns=X_train.axes[1]) if test_split > 0.0: X_test_scaler = preprocessing.MinMaxScaler( feature_range=(0, 1)).fit(X_test) X_test_scaled = pd.DataFrame(X_test_scaler.transform(X_test), columns=X_test.axes[1]) else: X_test_scaled = [] print "\n feature min: ", X_train_scaler.data_min_ print " feature max: ", X_train_scaler.data_max_ print " feature range: ", X_train_scaler.data_range_ print " feature scales: \n", X_train_scaler.scale_ print "\nScaled training inputs:" print " shape: ", np.shape(X_train_scaled) if DEBUG: print "\n X_train_scaled:\n", X_train_scaled print "\nScaled testing inputs:" print " shape:", np.shape(X_test_scaled) print "\n X_test_scaled:\n", X_test_scaled Y_train_scaler = preprocessing.MinMaxScaler( feature_range=(0, 1)).fit(Y_train) Y_train_scaled = pd.DataFrame(Y_train_scaler.transform(Y_train), columns=Y_train.axes[1]) if 
test_split > 0.0: Y_test_scaler = preprocessing.MinMaxScaler( feature_range=(0, 1)).fit(Y_test) Y_test_scaled = pd.DataFrame(Y_test_scaler.transform(Y_test), columns=Y_test.axes[1]) else: Y_test_scaled = [] print "\n output min: ", Y_train_scaler.data_min_ print " output max: ", Y_train_scaler.data_max_ print " output range: ", Y_train_scaler.data_range_ print " output scales: \n", Y_train_scaler.scale_ print "\nScaled training inputs:" print " shape: ", np.shape(Y_train_scaled) if DEBUG: print "\n Y_train_scaled:\n", Y_train_scaled print "\nScaled testing inputs:" print " shape:", np.shape(Y_test_scaled) print "\n Y_test_scaled:\n", Y_test_scaled #print "\nBefore scaling:" #print np.shape(X_train) #print X_train # This is just for visualizing the normalization transformations with histograms if DEBUG is True and 1: fig, axes = plt.subplots(np.shape(X_train)[1], sharex=True, sharey=True) for ax, label in izip(axes, X_train.axes[1]): ax.hist(X_train[label], bins=7) ax.set_title(label) fig.suptitle( "Distribution of design parameters before normalization") fig, axes = plt.subplots(np.shape(X_train_scaled)[1], sharex=True, sharey=True) print X_train_scaled.axes for ax, label in izip(axes, X_train_scaled.axes[1]): ax.hist(X_train_scaled[label], bins=7) ax.set_title(label) fig.suptitle( "Distribution of design parameters after normalization") if len(Y_train) is not 0 and len(Y_train_scaled) is not 0: fig, axes = plt.subplots(np.shape(Y_train)[1], sharex=True, sharey=True) for ax, label in izip(axes, Y_train.axes[1]): ax.hist(Y_train[label], bins=7) ax.set_title(label) fig.suptitle( "Distribution of performance parameters before normalization" ) fig, axes = plt.subplots(np.shape(Y_train_scaled)[1], sharex=True, sharey=True) for ax, label in izip(axes, Y_train_scaled.axes[1]): ax.hist(Y_train_scaled[label], bins=7) ax.set_title(label) fig.suptitle( "Distribution of performance parameters after normalization" ) plt.show() else: X_train_scaled = X_train X_test_scaled = 
X_test print "\n=========================" print "SUPPORT VECTOR REGRESSION" print "=========================" surrogate_models = [ ] # Array to hold the surrogate model objects for each output parameter # If gridsearch is True, use scikit-learn's gridsearch to systematically search for optimal # hyperparameter values. Else, we use hyperparameter values set by the user to construct and # train surrogate models for each performance variable. if gridsearch: # construct a surrogate model for each output parameter (performance metric) print "My God... They're learning..." for n, target_parameter in enumerate(Y_train_scaled): print "\n------------------------" print target_parameter print "------------------------" if DEBUG: print Y_train_scaled[target_parameter] model = generate_optimized_surrogate( X_train_scaled, Y_train_scaled[target_parameter], label=target_parameter, C_range=C_range, epsilon_range=epsilon_scale, grid_iter=optimize_iter, scoring=model_scoring) surrogate_models.append(model) else: for n, target_parameter in enumerate(Y_train_scaled): print "\n------------------------" print target_parameter print "------------------------" model = SVR(kernel='rbf', C=C_tuple[n], epsilon=epsilon_tuple[n], gamma='auto').fit(X_train_scaled, Y_train_scaled[target_parameter]) surrogate_models.append(model) print "\nSurrogate models:\n", surrogate_models """ print np.shape(surrogate_model) print surrogate_model # make predictions over the output surrogate data. #prediction_outputs = [model.predict(X_train_scaled) for model in surrogate_model] prediction_outputs = surrogate_model[1].predict(X_train_scaled) print np.shape(prediction_outputs) print prediction_outputs """ # If the sampled data was split into training and testing sets, evaluate the generated models # on the testing data. Otherwise, compute cross-validated scores using the training data. 
# First, instantiate a list to hold our scaler (transformation) objects to transform the values # predicted by the models to the range of the performance metrics being modeled. Y_scalers = [] for n, model in enumerate(surrogate_models): print "\n------------------------" print targets[n] print "------------------------" if test_split > 0.0: print "\n=========================" print "MODEL EVALUATION" print "=========================" predictions = model.predict(X_test_scaled) target_values = Y_test[targets[n]] # reverse-transform the outputs and predictions back to their original values Y_test_scaler = preprocessing.MinMaxScaler().fit( Y_test[targets[n]].reshape(-1, 1)) predictions = Y_test_scaler.inverse_transform( predictions.reshape(-1, 1)) #print Y_test[:,n] #print predictions #result_array = np.column_stack((Y_test[:,n].reshape(-1,1), predictions)) print "test values, predicted values" print target_values, predictions print "model score:", metrics.mean_squared_error( target_values, predictions) #print "model score: ", model.score(target_values, predictions) print "model parameters:" parameters = model.get_params() print ' C: ', parameters['C'] print ' epsilon: ', parameters['epsilon'] #print ' gamma: ', parameters['gamma'] # If a testing set was not set aside, use Leave-One-Out (LOO) cross-validation else: scaled_target_values = Y_train_scaled[targets[n]].values target_values = Y_train[targets[n]].values scores = cross_validation.cross_val_score( model, X_train_scaled.values, scaled_target_values, scoring='mean_squared_error', cv=len(Y_train_scaled)) avg_score = np.mean(scores) score_std = np.std(scores) print "model avg score: %1.5f (+/-%1.5f)" % (-avg_score, score_std) predictions = cross_validation.cross_val_predict( model, X_train_scaled.values, scaled_target_values, cv=len(Y_train_scaled)) # Make a scaler and inverse transform the predictions back to their original, unscaled ranges Y_test_scaler = preprocessing.MinMaxScaler().fit(target_values) 
predictions = Y_test_scaler.inverse_transform(predictions) Y_scalers.append(Y_test_scaler) print "Y_scalers[%d]: " % n, Y_scalers[n] # plot the predicted vs actual values fig, ax = plt.subplots() ax.scatter(predictions, target_values, marker='x') ax.plot(target_values, target_values, c='b', linestyle='--') ax.set_xlabel("Predicted Values") ax.set_ylabel("Actual Values") ax.set_title("Predicted vs Actual Target Values: %s" % targets[n]) fig.savefig('%s%s_%s_predicted_vs_actual.png' % (output_directory, data_title, targets[n])) """ if test_split > 0.0: print "\n=========================" print "MODEL EVALUATION" print "=========================" # step through each model and evaluate its performance on the testing data for n, model in enumerate(surrogate_models): print "\n------------------------" print targets[n] print "------------------------" predictions = model.predict(X_test_scaled) target_values = Y_test[targets[n]] # reverse-transform the outputs and predictions back to their original values Y_test_scaler = preprocessing.MinMaxScaler().fit(Y_test[targets[n]].reshape(-1,1)) predictions = Y_test_scaler.inverse_transform(predictions.reshape(-1,1)) #print Y_test[:,n] #print predictions #result_array = np.column_stack((Y_test[:,n].reshape(-1,1), predictions)) print "test values, predicted values" print target_values, predictions print "model score:", metrics.mean_squared_error(target_values, predictions) #print "model score: ", model.score(target_values, predictions) print "model parameters:" parameters = model.get_params() print ' C: ', parameters['C'] print ' epsilon: ', parameters['epsilon'] #print ' gamma: ', parameters['gamma'] # plot the predicted vs actual values fig, ax = plt.subplots() ax.scatter(predictions, target_values, marker = 'x') ax.plot(target_values, target_values, c='b', linestyle='--') ax.set_xlabel("Predicted Values") ax.set_ylabel("Actual Values") ax.set_title("Predicted vs Actual Target Values: %s" %targets[n]) 
fig.savefig('%s%s_predicted_vs_actual.png' %(output_directory, targets[n])) else: print "\n=========================" print "MODEL CROSS-VALIDATION" print "=========================" # Use cross-validation to evaluate the models created above for n, model in enumerate(surrogate_models): print "\n------------------------" print targets[n] print "------------------------" scaled_target_values = Y_train_scaled[targets[n]].values target_values = Y_train[targets[n]].values scores = cross_validation.cross_val_score(model, X_train_scaled.values, scaled_target_values, scoring = 'mean_squared_error', cv = len(Y_train_scaled)) avg_score = np.mean(scores) score_std = np.std(scores) print "model avg score: %1.5f (+/-%1.5f)" %(-avg_score, score_std) predictions = cross_validation.cross_val_predict(model, X_train_scaled.values, scaled_target_values, cv = len(Y_train_scaled)) # Make a scaler and inverse transform the predictions back to their original, unscaled ranges Y_test_scaler = preprocessing.MinMaxScaler().fit(target_values) predictions = Y_test_scaler.inverse_transform(predictions) # plot the predicted vs actual values fig, ax = plt.subplots() ax.scatter(predictions, target_values, marker = 'x') ax.plot(target_values, target_values, c='b', linestyle='--') ax.set_xlabel("Predicted Values") ax.set_ylabel("Actual Values") ax.set_title("Predicted vs Actual Target Values: %s" %targets[n]) fig.savefig('%s%s_predicted_vs_actual.png' %(output_directory, targets[n])) """ if save_models is True: model_file = data_title + "_surrogate_models.pkl" input_scaler_file = data_title + "_input_scalers.pkl" scaler_file = data_title + "_datascalers.pkl" models_savefile = output_directory + model_file input_scalers_savefile = output_directory + input_scaler_file scalers_savefile = output_directory + scaler_file #models_savefile = "%s%s_surrogate_models.pkl" %(output_directory, data_name) #scalers_savefile = "%s%s_datascalers.pkl" %(output_directory, data_name) with open(models_savefile, 'w') as 
f: pickle.dump(surrogate_models, f) with open(input_scalers_savefile, 'w') as f: pickle.dump(X_train_scaler, f) with open(scalers_savefile, 'w') as f: pickle.dump(Y_scalers, f) return surrogate_models, Y_scalers
# Load the data, fit a linear-kernel SVR as a baseline and define the
# hyperparameter grid for a randomized search.
dataset = pd.read_csv('ReceivingTimes.csv')
# Features: every column except the last. Target: the column at index 4.
# NOTE(review): presumably index 4 is the last column, so that the target is
# not also part of X -- confirm against the CSV layout.
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,4].values

# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets (25% held out, fixed seed)
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.25, random_state = 42)

from sklearn.svm import SVR
# NOTE(review): degree is passed together with kernel='linear'; presumably it
# only matters for the 'poly' kernel -- confirm.
regressor=SVR(kernel='linear',degree=1)
regressor.fit(xtrain, ytrain)

# Look at parameters used by our current regressor. The dict has to be printed
# explicitly: in a script a bare expression's value is discarded.
print('Parameters currently in use:\n')
print(regressor.get_params())

from sklearn.model_selection import RandomizedSearchCV
# Candidate values for the C parameter
C = [0.001, 0.01, 0.1,1,10]
# Kernels to try
kernel = ['linear', 'poly']
# Candidate values for the epsilon parameter
epsilon = [0.001, 0.01, 0.1,1,10]
# Candidate values for the gamma parameter
gamma = [0.001, 0.01, 0.1,1,10]
# Create the random grid
random_grid = {'C': C,
               'kernel': kernel,
               'epsilon': epsilon,
               'gamma': gamma}
# Prepare the training frame, create three SVR variants and grid-search the
# RBF hyperparameters with 5-fold cross-validation.

# Move the old index into an 'index' column, then discard that column (in place).
ytr.reset_index(inplace=True)
ytr.drop(['index'], axis = 1, inplace=True)
X = ly_test['date']
retry = ytr['surge'] # surge in training data
horizontal = ytr['date'] # date in training data
# attempt support vector regression
svr_rbf = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
svr_lin = SVR(kernel='linear', C=100, gamma='auto')
svr_poly = SVR(kernel='poly', C=100, gamma='auto', degree=3, epsilon=.1, coef0=1)
# Tune the hyperparameters
# NOTE(review): the next expression's value is discarded when this runs as a
# script; it only displays something in an interactive session.
svr_rbf.get_params()
# Grid: RBF kernel only, 5 values of C x 4 values of gamma
svr_params = {'kernel': ['rbf'], 'C': [0.1,1,10,20,50], 'gamma':[1, 0.1, 0.01, 0.001]}
tune = GridSearchCV(SVR(), svr_params, cv=5)
tune.fit(lx_norm_train,ly_train['surge'])
# NOTE(review): same as above -- this bare expression has no effect in a script.
tune.cv_results_
print("Best score: ", tune.best_score_) #0.727 (2-yr. data)
print("Best parameters: ", tune.best_params_)
# Try with the best parameters (2-yr data)
#svr_rbf = SVR(kernel='rbf', C=1, gamma=0.001)
# Best parameters for cuxhaven.de (~5yr. data)
# score: 0.831, Best parameters: {'C': 10, 'gamma': 0.001
#svr_rbf = SVR(kernel='rbf', C=10, gamma=0.001)
# Fit an SVR with default hyper-parameters and report its test-set errors.
# regressor = SVR(kernel='rbf',C=5.0, epsilon=0.1)
regressor = SVR()
regressor.fit(XTrain, yTrain)

# Calculate errors
yTestPredict = regressor.predict(XTest)
mse = mean_squared_error(yTest, yTestPredict)
# The `squared` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6. For SVR's single-output target the
# RMSE is simply the square root of the MSE, which works on every version.
rmse = mse ** 0.5
mae = mean_absolute_error(yTest, yTestPredict)
mape = mean_absolute_percentage_error(yTest, yTestPredict)

print("The mean squared error (MSE) on test set: {:.4f}".format(mse))
print("The root Mean Square Error (RMSE) on test set: {:.4f}".format(rmse))
print("The mean absolute error on test set: {:.4f}".format(mae))
print("The mean absolute percentage error on test set: {:.4f}".format(mape))
print(regressor.get_params(deep=True))

# plt.plot(degreeGrid, mseValues, color='blue')
# plt.xlabel('degree values')
# plt.ylabel('Mean square error values')
# plt.title('kernel = poly')
# plt.show()
#
# plt.plot(degreeGrid, rmseValues, color='red')
# plt.xlabel('degree values')
# plt.ylabel('Root mean square error values')
# plt.title('kernel = poly')
# plt.show()
#
# plt.plot(degreeGrid, maeValues, color='green')
# plt.xlabel('degree values')
# Split the rows of predictiveAttributeNotDegree into train and test sets,
# fit svm_reg_tot on the train part and report its score, MSE and parameters.

# Positions of the predictive attributes inside a row, and of the target.
FEATURE_COLUMNS = (0, 1, 6, 7, 8, 9, 10, 11, 12, 13, 17)
TARGET_COLUMN = 2

# Rows go to the training set until `train_percent` rows have been collected;
# every remaining row goes to the test set.
count = 0
for i in range(len(predictiveAttributeNotDegree)):
    row = predictiveAttributeNotDegree[i]
    features = [row[col] for col in FEATURE_COLUMNS]
    target = [row[TARGET_COLUMN]]
    if count < train_percent:
        count = count + 1
        train_set_tot.append(features)
        train_result_tot.append(target)
    else:
        test_set_tot.append(features)
        test_result_tot.append(target)

# fit() expects a 1-D target, hence the ravel() of the (n, 1) column.
train_result_tot = np.array(train_result_tot)
svm_reg_tot.fit(train_set_tot, train_result_tot.ravel())
print("----ALL ATTRIBUTE: score: ", svm_reg_tot.score(test_set_tot, test_result_tot))

# Predict the whole test set in one call instead of one row at a time.
pred = np.asarray(svm_reg_tot.predict(test_set_tot), dtype=float)

# Arguments are given in the documented (y_true, y_pred) order; the MSE value
# itself is symmetric in its two arguments.
print("MSE: {}".format(mean_squared_error(test_result_tot, pred)))
print("----ALL ATTRIBUTE: Params: ", svm_reg_tot.get_params())