Ejemplo n.º 1
0
class SVR(PlayerModel):
    ### a wrapper for support vector regression using scikit-learn for this project
    def __init__(self):
        """Build the player model: fit a linear-kernel SVR (C=1000) on the
        training data prepared by the base class.

        NOTE(review): assumes PlayerModel.__init__ populates
        self.dataset_X_train / self.dataset_Y_train -- confirm in base class.
        """
        PlayerModel.__init__(self)
        # configure support vector regression and start training
        self.regr = SupportVectorRegression(kernel = 'linear', C = 1000)
        self.regr.fit(self.dataset_X_train, self.dataset_Y_train)
        print "Finish building player model."
        print "Parameters: ", self.regr.get_params()
        print "============================================================"

    def testScore(self, test_X):
        """Predict scores for the normalized test samples; return their mean."""
        score = self.regr.predict(self.normalizeTest(test_X))
        return np.mean(score)

    def getParams(self):
        """Return the underlying estimator's hyper-parameters (sklearn get_params)."""
        return self.regr.get_params()

    def visualize(self):
        """Plot training data and model predictions over the first feature.

        All other features are held at their training mean while the first
        feature sweeps 10 evenly spaced values in [0, 0.99].
        NOTE(review): assumes self.col is the feature count + 1 -- confirm.
        """
        x = np.zeros((10, self.col - 1))
        mean = self.dataset_X_train.mean(0)
        for i in range(10):
            x[i, :] = mean
        x[:, 0:1] = np.array([np.arange(0.0, 1.1, 0.11)]).T
        # print x
        y = self.regr.predict(x)
        # print y
        pyplot.scatter(self.dataset_X_train[:, 0:1], self.dataset_Y_train, c='k', label='data')
        pyplot.hold('on')
        pyplot.plot(x[:, 0:1], y, c = "r", label='Support Vector Regression')
        pyplot.xlabel('data collect from player')
        pyplot.ylabel('score')
        pyplot.title('Support Vector Regression')
        pyplot.legend()
        pyplot.show()
Ejemplo n.º 2
0
 def SVRegresion(self):
     """Fit RBF-kernel SVRs mapping training_u -> training_x and
     training_u -> training_y, then predict both at the validation inputs.

     Reshapes the stored arrays in place to the (n, 1) / (n,) shapes that
     sklearn's SVR requires.  Returns (predict_x, predict_y).
     NOTE(review): the same estimator object is refit for y, so after this
     call svr_rbf only holds the y-model -- confirm that is intended.
     """
     n = len(self.training_u)  # the number of data
     self.training_u = self.training_u.reshape((n, 1))  # the requirement of SVR
     self.training_x = self.training_x.reshape((n,))
     self.training_y = self.training_y.reshape((n,))
     self.vali_u = self.vali_u.reshape((len(self.vali_u), 1))
     svr_rbf = SVR(kernel='rbf', C=self.C, gamma=self.gamma,
                   epsilon=self.epsilon)  # https://en.wikipedia.org/wiki/Radial_basis_function
     self.predict_x = svr_rbf.fit(self.training_u, self.training_x).predict(self.vali_u)
     self.predict_y = svr_rbf.fit(self.training_u, self.training_y).predict(self.vali_u)
     print svr_rbf.get_params(deep=True)
     return self.predict_x, self.predict_y
Ejemplo n.º 3
0
def SVRegresion(X, Y, X_vali, C, gamma, epsilon):
    """Fit an RBF-kernel SVR on (X, Y) and return its predictions at X_vali.

    X, Y and X_vali are reshaped to the (n, 1) / (n,) shapes sklearn's SVR
    expects; C, gamma and epsilon are passed straight to the estimator.
    """
    n = len(X)  #the number of data
    X = X.reshape((n, 1))  # the requirement of SVR
    Y = Y.reshape((n, ))
    X_vali = X_vali.reshape((len(X_vali), 1))
    #mean = sum(X*Y)/n                   #note this correction
    #sigma = sum(Y*(X-mean)**2)/n
    print np.shape(X), np.shape(Y)
    svr_rbf = SVR(
        kernel='rbf', C=C, gamma=gamma,
        epsilon=epsilon)  #https://en.wikipedia.org/wiki/Radial_basis_function
    Y_rbf = svr_rbf.fit(X, Y).predict(X_vali)
    print svr_rbf.get_params(deep=True)
    #print Y_rbf
    return Y_rbf
Ejemplo n.º 4
0
def svr():
    """Flask view: fit an RBF SVR on the cached arrays and render the plot.

    Reads C/gamma from the POSTed form, fits on the stored x/y arrays,
    writes a scatter+fit figure to static/result.png and renders the
    result template.  Only handles POST (returns None otherwise).
    """
    if request.method == 'POST':
        # load the previously stored training arrays
        x = np.load("x_storage.npy")
        y = np.load("y_storage.npy")
        time_set = np.load("time_storage.npy")
        # fit with the user-supplied hyper-parameters
        c_value = float(request.form['C'])
        gamma_value = float(request.form['gamma'])
        model = SVR(kernel='rbf', C=c_value, gamma=gamma_value)
        model.fit(x, y)
        fitted = model.predict(x)
        # draw the data and fitted curve
        plt.scatter(time_set, y, color='darkorange', label='data')
        plt.plot(time_set, fitted, color='navy', lw=2, label='RBF model')
        plt.xlabel('date')
        plt.ylabel('GDP')
        plt.title('Support Vector Regression')
        # de-duplicate legend entries (scatter + line share labels otherwise)
        handles, labels = plt.gca().get_legend_handles_labels()
        dedup = OrderedDict(zip(labels, handles))
        plt.legend(dedup.values(), dedup.keys())
        axes = plt.axes()
        axes.xaxis.set_major_locator(plt.MaxNLocator(9))
        plt.savefig('static/result.png')
        plt.close()
        return render_template('result.html',
                               url='static/result.png',
                               co=model.score(x, y),
                               pa=model.get_params(),
                               C_default=request.form['C'],
                               gamma_default=request.form['gamma'])
Ejemplo n.º 5
0
class SVRForecaster(AbstractForecaster):
    """Forecaster backed by sklearn's epsilon-SVR.

    params_grid enumerates the hyper-parameter search space consumed by
    the surrounding framework.
    """
    params_grid = {
        "epsilon": list(range(0, 20, 2)),
        "gamma": ['auto'] + list(range(1, 100, 10))
    }

    def __init__(self, epsilon=5, gamma='auto'):
        self.svr = SVR(epsilon=epsilon, gamma=gamma)

    def _decision_function(self, X):
        # Delegate prediction to the fitted estimator.
        return self.svr.predict(X)

    def _train(self, data):
        X, y = data.all_train_data()
        self.svr.fit(X, y)

    def _build(self):
        # Nothing to build lazily; the estimator is created in __init__.
        pass

    def score(self, X, y):
        """Root-mean-square relative error; the +0.1 guards against y == 0."""
        prediction = self.predict(X)
        return np.sqrt(np.mean(np.square((prediction - y + 0.1) / (y + 0.1))))

    def save(self):
        """Persist the SVR hyper-parameters; returns the saved file path.

        NOTE(review): ':' in the timestamp is not a legal filename
        character on Windows -- confirm target platforms.
        """
        file_name = self.__class__.__name__ + datetime.datetime.now().strftime(
            "%Y-%m-%d-%H:%M")
        path = os.path.join(MODEL_DIR, file_name)
        joblib.dump(self.svr.get_params(), path)
        return path

    @staticmethod
    def load_model(file_name):
        """Restore a forecaster from a params dump written by save()."""
        model = SVRForecaster()
        # BUG FIX: sklearn's set_params takes keyword arguments, not a
        # positional dict -- the saved get_params() dict must be unpacked,
        # otherwise this raises TypeError on every load.
        model.svr.set_params(**joblib.load(os.path.join(MODEL_DIR, file_name)))
        return model
 def __init__(self, params):
     """Node-template constructor.

     NOTE(review): %CLASS% is a code-generation placeholder substituted by
     the surrounding tooling -- this snippet is not valid Python as-is.
     Creates one data input widget per SVR hyper-parameter name.
     """
     super(%CLASS%, self).__init__(params)
     # instantiate a throwaway SVR just to enumerate its parameter names
     tmp = SVR()
     params = tmp.get_params()
     for key in params:
         self.create_new_input(type_="data", label=key, widget_name="std line edit m", widget_pos="besides", pos=-1)
     del tmp
Ejemplo n.º 7
0
    def grid_search(self, params, data_name):
        """Run a 5-fold GridSearchCV over an RBF SVR for *data_name*.

        Prints the best parameters and the next day/week/month predictions
        (inverse-transformed back to the original target scale), then
        reports errors via SVRCalculator.count_errors.
        """
        base_estimator = SVR(kernel='rbf', C=1.0, epsilon=0.2)
        print(base_estimator.get_params().keys())

        min_date, max_date, y, X = SVRCalculator.get_data(data_name)
        X, y, sc_y, next_day, next_week, next_month, sc_X = SVRCalculator.data_normalization(
            max_date, min_date, y, X)

        searcher = GridSearchCV(estimator=base_estimator,
                                param_grid=params,
                                cv=5,
                                n_jobs=4,
                                verbose=0,
                                refit=True)
        searcher.fit(X, y.values.ravel())

        print("Best parameters for")
        print(data_name)
        print(searcher.best_params_)

        # predict each horizon, undoing the target scaling
        horizons = (("Next day prediction: ", next_day),
                    ("Next week prediction: ", next_week),
                    ("Next month prediction: ", next_month))
        results = [(label, sc_y.inverse_transform(searcher.predict(points)))
                   for label, points in horizons]
        for label, prediction in results:
            print(label, prediction)

        SVRCalculator.count_errors(y, X, sc_y, searcher)
Ejemplo n.º 8
0
def svr(term='poly4'):
    """
    Method to load unfitted SVR models of type modelclass

    INPUT:
    term: 'linear', 'poly2', 'poly4' or 'rbf'

    RETURN:
    model (an unfitted sklearn.svm.SVR)

    RAISES:
    ValueError for any other value of *term*.
    """
    # BUG FIX: the original compared strings with 'is', which tests object
    # identity and only works by accident of CPython string interning;
    # equality must use '=='.
    if term == 'linear':
        regmod = SVR(kernel='linear',
                     gamma='auto_deprecated',
                     C=1.0, epsilon=0.1)
    # SVR with poly kernel
    elif term == 'poly2':
        regmod = SVR(kernel='poly', degree=2,
                     gamma='auto_deprecated',
                     C=1.0, epsilon=0.1)
    # SVR with poly kernel
    elif term == 'poly4':
        regmod = SVR(kernel='poly', degree=4,
                     gamma='auto_deprecated',
                     C=1.0, epsilon=0.1)
    # SVR with rbf kernel
    elif term == 'rbf':
        regmod = SVR(kernel='rbf',
                     gamma='auto_deprecated',
                     C=1.0, epsilon=0.1)
    else:
        raise ValueError('Term unknown')
    utils.display_get_params('SVR Model Description', regmod.get_params())
    return(regmod)
Ejemplo n.º 9
0
 def __init__(self, params):
     """Node-template constructor.

     NOTE(review): %CLASS% is a code-generation placeholder substituted by
     the surrounding tooling -- this snippet is not valid Python as-is.
     Creates one data output per SVR hyper-parameter name, plus one extra
     output carrying the whole parameter dict.
     """
     super(%CLASS%, self).__init__(params)
     # instantiate a throwaway SVR just to enumerate its parameter names
     tmp = SVR()
     params = tmp.get_params()
     for key in params:
         self.create_new_output(type_="data", label=key, pos=-1)
     del tmp
     self.create_new_output(type_="data", label="param dict", pos=-1)
Ejemplo n.º 10
0
class sumSVR(object):
    """SVR over a weighted sum of per-dimension precomputed kernels.

    Each input column i is mapped through kernel_functions[i] (with
    kernel_kwargs[i]) and the resulting Gram matrices are combined with
    the normalized weight vector w before being handed to an SVR using
    kernel='precomputed'.
    """

    def __init__(self, dim=None, *args, **kwargs):
        # dim: number of input columns / kernels (defaults to 1)
        self.dim = dim if dim is not None else 1

        w = kwargs.pop("w", None)

        # BUG FIX: normalize an explicit kernel_functions=None to an empty
        # list.  The original kept None (so fit/predict would crash on
        # indexing) and also failed to pop kernel_kwargs in that case,
        # leaking it into the SVR constructor.
        self.kernel_functions = kwargs.pop("kernel_functions", None) or []
        self.kernel_kwargs = kwargs.pop("kernel_kwargs",
                                        [{} for _ in self.kernel_functions])

        # the combined Gram matrix is passed directly to the estimator
        kwargs["kernel"] = "precomputed"
        if w is None:
            # BUG FIX: the original used the local `dim`, which is None
            # when the caller relied on the default -- use self.dim.
            w = np.ones(self.dim)

        # normalize the weight vector to unit length
        self.w = w / np.linalg.norm(w)
        # training inputs (stored by fit; may be preset via kwargs['x'])
        self.x = kwargs.pop('x', None)

        self.SVR = SVR(*args, **kwargs)

    def fit(self, x, y):
        """Fit on raw inputs x (n_samples, dim); builds the summed training Gram."""
        self.x = x
        kernel_train = np.zeros((x.shape[0], x.shape[0]))
        for i in range(self.dim):
            x_i = x[:, i]
            kernel_i = self.kernel_functions[i](x_i, **self.kernel_kwargs[i])
            kernel_train += self.w[i] * kernel_i

        self.SVR.fit(kernel_train, y)

    def predict(self, x):
        """Predict for raw inputs x via the test-vs-train summed Gram matrix."""
        kernel_test = np.zeros((x.shape[0], self.x.shape[0]))
        for i in range(self.dim):
            x_i = x[:, i]
            tr_i = self.x[:, i]
            kernel_i = self.kernel_functions[i](x_i, tr_i, **self.kernel_kwargs[i])
            kernel_test += self.w[i] * kernel_i

        return self.SVR.predict(kernel_test)

    def get_params(self, deep=False):
        """sklearn-style params: the wrapped SVR's params plus our extras."""
        params = self.SVR.get_params()
        params['dim'] = self.dim
        params['w'] = self.w
        params['kernel_functions'] = self.kernel_functions
        params['kernel_kwargs'] = self.kernel_kwargs
        params['x'] = self.x
        return params

    def set_params(self, **params):
        """Re-initialize in place from a params dict; returns self."""
        self.__init__(**params)
        return self
Ejemplo n.º 11
0
    def test_parameters(self):
        """ Testing parameters of Model class. """
        # Each case: (algorithm code, wrapper parameters, reference sklearn
        # estimator).  For every case we build a Model and verify that each
        # of its parameter names is a valid parameter of the reference
        # estimator built directly via sklearn.
        cases = [
            ("PlsRegression",
             {"n_components": 20, "scale": False, "max_iter": 200},
             PLSRegression(n_components=20, scale="svd", max_iter=200)),
            ("RandomForest",
             {"n_estimators": 200, "max_depth": 50, "min_samples_split": 10},
             RandomForestRegressor(n_estimators=200, max_depth=50, min_samples_split=10)),
            ("KNN",
             {"n_neighbors": 10, "weights": "distance", "algorithm": "ball_tree"},
             KNeighborsRegressor(n_neighbors=10, weights='distance', algorithm="kd_tree")),
            ("SVR",
             {"kernel": "poly", "degree": 5, "coef0": 1},
             SVR(kernel='poly', degree=5, coef0=1)),
            ("AdaBoost",
             {"n_estimators": 150, "learning_rate": 1.2, "loss": "square"},
             AdaBoostRegressor(n_estimators=150, learning_rate=1.2, loss="square")),
            ("Bagging",
             {"n_estimators": 50, "max_samples": 1.5, "max_features": 2},
             BaggingRegressor(n_estimators=50, max_samples=1.5, max_features="square")),
            ("lasso",
             {"alpha": 1.5, "max_iter": 500, "tol": 0.004},
             Lasso(alpha=1.5, max_iter=500, tol=0.004)),
        ]
        for algorithm, parameters, reference in cases:
            model = Model(algorithm=algorithm, parameters=parameters)
            for param_name in model.model.get_params():
                self.assertIn(param_name, list(reference.get_params()))
Ejemplo n.º 12
0
class SVM():
    """Unified wrapper around sklearn SVMs.

    task='cls' builds an SVC (classification); task='prd' builds an SVR
    (prediction/regression).  All other methods delegate to the wrapped
    estimator.
    """

    def __init__(self, task='cls', **kwargs):
        if task == 'cls':
            self.svm = SVC(**kwargs)
            self._name = 'SVC'
        elif task == 'prd':
            self.svm = SVR(**kwargs)
            self._name = 'SVR'

    def decision_function(self, X):
        """Distance of samples to the separating hyperplane(s).

        X: (n_samples, n_features) -> (n_samples, n_classes*(n_classes-1)/2).
        Only meaningful for classification; returns None when wrapping SVR.
        """
        if self._name == 'SVC':
            return self.svm.decision_function(X)
        return None

    def fit(self, X, y, sample_weight=None):
        """Fit on X (n_samples, n_features), y (n_samples,), optional
        per-sample weights (n_samples,)."""
        return self.svm.fit(X, y, sample_weight)

    def get_params(self, deep=True):
        """Hyper-parameters of the wrapped estimator."""
        return self.svm.get_params(deep)

    def predict(self, X):
        """Predicted labels (SVC) or values (SVR) for X."""
        return self.svm.predict(X)

    def score(self, X, y, sample_weight=None):
        """Mean accuracy (SVC) or R^2 (SVR) on (X, y).

        y: (n_samples,) or (n_samples, n_outputs).
        """
        return self.svm.score(X, y, sample_weight)

    def set_params(self, **params):
        """Update hyper-parameters of the wrapped estimator."""
        return self.svm.set_params(**params)
Ejemplo n.º 13
0
class SVM():
    """RBF-kernel SVR wrapper tracking train/test mean absolute error.

    Portuguese API: treinar = train, testar = test, imprimir = print a
    summary of hyper-parameters, split sizes and both MAEs.
    """

    # sample counts and MAEs, populated by treinar() / testar()
    num_amostra_treinamento = None
    num_amostra_teste = None
    mae_treinamento = None
    mae_teste = None

    def __init__(self, gamma_, C_, epsilon_):
        self.svm = SVR(kernel="rbf", gamma=gamma_, C=C_, epsilon=epsilon_)

    def treinar(self, X, Y):
        """Fit on (X, Y) and record the training-set MAE."""
        self.num_amostra_treinamento = X.shape[0]
        self.svm.fit(X, Y)
        self.mae_treinamento = mean_absolute_error(Y, self.svm.predict(X))

    def testar(self, X, Y):
        """Record the test-set MAE on (X, Y) without refitting."""
        self.num_amostra_teste = X.shape[0]
        self.mae_teste = mean_absolute_error(Y, self.svm.predict(X))

    def imprimir(self):
        """Print hyper-parameters, split percentages and both MAEs."""
        params = self.svm.get_params()
        total = self.num_amostra_treinamento + self.num_amostra_teste

        print("SVM com núcleo RBF")
        print("C = %f" % params['C'])
        print("Gamma = %f" % params['gamma'])
        print("Epsilon = %f" % params['epsilon'])

        print("Conjunto de Treinamento: %d amostras (%.2f%%)" %
              (self.num_amostra_treinamento,
               (100.0 / total) * self.num_amostra_treinamento))
        print("MAE Treinamento: %f" % self.mae_treinamento)
        print("Conjunto de Teste: %d amostras (%.2f%%)" %
              (self.num_amostra_teste,
               (100.0 / total) * self.num_amostra_teste))
        print("MAE Teste: %f" % self.mae_teste)
Ejemplo n.º 14
0
    # NOTE(review): fragment of a larger routine -- `mses`, `cs`, `fitter`,
    # `data_train`, `tec_train`, `data_validate` and `tec_validate` are
    # defined outside this excerpt.
    imax = np.argmin(mses)



    #fitter = AdaBoostRegressor(n_estimators=50)
    #fitter = gaussian_process.GaussianProcess()
    #fitter = LinearRegression()





    # refit an RBF SVR at the C value that minimised the validation MSE
    fitter2 = SVR(kernel='rbf',C=cs[imax])
    tec_validate_fit = fitter2.fit(data_train,tec_train).predict(data_validate)

    # NOTE(review): prints the params of `fitter` (from earlier in the
    # original file), not of `fitter2` fitted above -- possibly a leftover.
    print fitter.get_params(deep=True)
    #coefs = fitter.coef_
    #print abs(coefs[0:6]).sum()
    #print abs(coefs[6:12]).sum()
    #print abs(coefs[12:18]).sum()
    #print coefs[-1]

    #MSE: 
    mse = np.mean((tec_validate_fit-tec_validate)**2)
    #print "smse",np.sqrt(mse)
    #print fitter.coef_


    #plot 
    import matplotlib.pyplot as plt
Ejemplo n.º 15
0
from sklearn.svm import SVR
import numpy as np

# Python 2 script: fit a default SVR on a CSV dataset and report
# statistics of the in-sample prediction errors.
filename = '2004_2009f.csv'

puredata = np.loadtxt(filename, delimiter=',')
X = puredata[:, 1:]  # features: every column after the first
Y = puredata[:, 0]   # target: the first column

svr = SVR()
print "fitting"
svr.fit(X, Y)
print "prediction"
y_pred = svr.predict(X)
# NOTE(review): `list` shadows the builtin; it holds per-sample residuals
list = []
for i in range(len(X)):
    #print Y[i],y_pred[i],i
    list.append(y_pred[i] - Y[i])

print "Number of tuples: ", len(X)
print "Mean of predictions : ", np.mean(y_pred)
print "Standard deviation : ", np.std(list, ddof=1)
print svr.get_params(deep=True)
Ejemplo n.º 16
0
class Baseline:
    """GIST-features + linear-SVR baseline for predicting the distance to a
    named destination from street-level images of a city.

    NOTE(review): depends on GISTExtractor, gist_wrapper/pool_init/get_key,
    cv2, lmdb and the ../data/dataset directory layout, all defined
    elsewhere in the original project.
    """
    def __init__(self, city, dest_name):
        self.city = city
        self.dest_name = dest_name
        print 'Baseline implementation for {:s} : {:s}'.format(
            self.city, self.dest_name)
        # column of each destination in the label files
        # NOTE(review): 'gas_station' and 'high_school' both map to 3 --
        # looks like a copy/paste slip (2 is unused); confirm against data.
        dest_to_idx = {
            'bofa': 0,
            'church': 1,
            'gas_station': 3,
            'high_school': 3,
            'mcdonalds': 4
        }
        self.idx = dest_to_idx[self.dest_name]
        self.base_dir = osp.join('../data/dataset', city)
        self.train_label_filename = osp.join(self.base_dir, 'distance',
                                             'train_labels.txt')
        self.train_im_list_filename = osp.join(self.base_dir, 'distance',
                                               'train_im_list.txt')
        self.test_label_filename = osp.join(self.base_dir, 'distance',
                                            'test_labels.txt')
        self.test_im_list_filename = osp.join(self.base_dir, 'distance',
                                              'test_im_list.txt')
        self.svr = SVR(kernel='linear',
                       shrinking=False,
                       cache_size=10000,
                       verbose=True)
        # self.svr = LinearSVR(verbose=1)

    def collect_train_data_parallel(self):
        """Extract GIST features for all training images with a process
        pool, filling self.train_X / self.train_y."""
        with open(self.train_im_list_filename, 'r') as train_f_im,\
            open(self.train_label_filename, 'r') as train_f_label:
            train_im_names = [
                osp.join('../data/dataset',
                         l.rstrip().split(' ')[0]) for l in train_f_im
            ]
            train_labels = [
                float(l.rstrip().split(' ')[self.idx]) for l in train_f_label
            ]

        # get dims
        ge = GISTExtractor(width=256, height=256)
        im = cv2.imread(train_im_names[0])
        gist_features = ge.extract_gist(im)
        self.train_X = np.zeros((len(train_im_names), gist_features.shape[0]),
                                dtype=np.float)
        self.train_y = np.asarray(train_labels)

        # parallel feature extraction!
        print 'Collecting training data'
        pool = Pool(initializer=pool_init, initargs=(256, 256))
        chunksize = len(train_im_names) / 4
        for idx, feat in enumerate(
                pool.imap(gist_wrapper, train_im_names, chunksize)):
            self.train_X[idx, :] = feat

        pool.close()
        pool.join()

    def collect_train_data_serial(self):
        """Load precomputed GIST features for training images from the
        lmdb cache, filling self.train_X / self.train_y."""
        with open(self.train_im_list_filename, 'r') as train_f_im,\
            open(self.train_label_filename, 'r') as train_f_label:
            train_im_names = [
                osp.join('../data/dataset',
                         l.rstrip().split(' ')[0]) for l in train_f_im
            ]
            train_labels = [
                float(l.rstrip().split(' ')[self.idx]) for l in train_f_label
            ]

        # get dims
        ge = GISTExtractor(width=256, height=256)
        im = cv2.imread(train_im_names[0])
        gist_features = ge.extract_gist(im)
        self.train_X = np.zeros((len(train_im_names), gist_features.shape[0]),
                                dtype=np.float)
        self.train_y = np.asarray(train_labels)

        db = lmdb.open('../data/dataset/gist',
                       map_size=int(1e12),
                       readonly=True)
        txn = db.begin()

        # serial feature extraction!
        print 'Collecting training data'
        for idx, im_name in enumerate(train_im_names):
            if idx % 100 == 0:
                print 'Image {:d} / {:d}'.format(idx, len(train_im_names))
            key = get_key(im_name)
            self.train_X[idx, :] = np.fromstring(txn.get(key))

    def collect_test_data_parallel(self):
        """Extract GIST features for all test images with a process pool,
        filling self.test_X / self.test_y."""
        with open(self.test_im_list_filename, 'r') as test_f_im,\
            open(self.test_label_filename, 'r') as test_f_label:
            test_im_names = [
                osp.join('../data/dataset',
                         l.rstrip().split(' ')[0]) for l in test_f_im
            ]
            test_labels = [
                float(l.rstrip().split(' ')[self.idx]) for l in test_f_label
            ]

        # get dims
        ge = GISTExtractor(width=256, height=256)
        im = cv2.imread(test_im_names[0])
        gist_features = ge.extract_gist(im)
        self.test_X = np.zeros((len(test_im_names), gist_features.shape[0]),
                               dtype=np.float)
        self.test_y = np.asarray(test_labels)

        # parallel feature extraction!
        print 'Collecting testing data'
        pool = Pool(initializer=pool_init, initargs=(256, 256))
        chunksize = len(test_im_names) / 4
        for idx, feat in enumerate(
                pool.imap(gist_wrapper, test_im_names, chunksize)):
            self.test_X[idx, :] = feat
        pool.close()
        pool.join()

    def collect_test_data_serial(self):
        """Load precomputed GIST features for test images from the lmdb
        cache, filling self.test_X / self.test_y."""
        with open(self.test_im_list_filename, 'r') as test_f_im,\
            open(self.test_label_filename, 'r') as test_f_label:
            test_im_names = [
                osp.join('../data/dataset',
                         l.rstrip().split(' ')[0]) for l in test_f_im
            ]
            test_labels = [
                float(l.rstrip().split(' ')[self.idx]) for l in test_f_label
            ]

        # get dims
        ge = GISTExtractor(width=256, height=256)
        im = cv2.imread(test_im_names[0])
        gist_features = ge.extract_gist(im)
        self.test_X = np.zeros((len(test_im_names), gist_features.shape[0]),
                               dtype=np.float)
        self.test_y = np.asarray(test_labels)

        db = lmdb.open('../data/dataset/gist',
                       map_size=int(1e12),
                       readonly=True)
        txn = db.begin()

        # serial feature extraction!
        print 'Collecting testing data'
        for idx, im_name in enumerate(test_im_names):
            if idx % 100 == 0:
                print 'Image {:d} / {:d}'.format(idx, len(test_im_names))
            key = get_key(im_name)
            self.test_X[idx, :] = np.fromstring(txn.get(key))

    def train(self, C=1.0, calc_loss=False):
        """Fit the SVR at regularisation C; returns the L2 loss on the
        test set when calc_loss is True, otherwise 0."""
        print 'Training with C = {:f}'.format(C)
        p = self.svr.get_params()
        p['C'] = C
        self.svr.set_params(**p)
        self.svr.fit(self.train_X, self.train_y)
        loss = 0
        if calc_loss:
            test_y_pred = self.svr.predict(self.test_X)
            loss = np.linalg.norm(test_y_pred - self.test_y)
            # score = self.svr.score(self.test_X, self.test_y)
            print 'Loss = {:f}'.format(loss)
        return loss

    def cross_validate(self):
        """Grid-search C over 10^-2 .. 10^4 by test-set loss; prints best C."""
        C = np.power(10.0, xrange(-2, 5))
        losses = np.array([self.train(c, calc_loss=True) for c in C])
        idx = np.argmin(losses)
        print 'Best C = {:f}'.format(C[idx])

    def save_current_model(self):
        """Pickle the fitted SVR under <base_dir>/distance/<dest>.pkl."""
        model_filename = osp.join(self.base_dir, 'distance',
                                  '{:s}.pkl'.format(self.dest_name))
        joblib.dump(self.svr, model_filename)
        print model_filename, 'saved'
Ejemplo n.º 17
0
def learn_models(model_names, features_to_use):
    """
    Train and evaluate the requested summary-scoring models.

    This version splits original texts in dataset for evaluating summaries.

    model_names: iterable of model codes ('dtr', 'linear', 'svm', 'dummy',
        'ideal', 'nb'); unknown codes are reported and skipped.
    features_to_use: feature names passed through to the utilities module.

    Returns (rouge_scores, model_results) of the LAST evaluated model.
    NOTE(review): both names are unbound (NameError at return) if
    model_names is empty or contains only unknown codes -- confirm callers
    always pass at least one known model.
    """
    dataset_features = utilities.load_features('CNN')
    features, targets, labels, documents, all_vec = utilities.split_dataset(
        dataset_features, features_to_use, 0.28, 'CNN')
    #return utilities.write_dataset_csv(dataset_features, '/tmp/test.csv')
    '''
    cPickle.dump(features, open('features.pkl', 'wb'))
    cPickle.dump(targets, open('targets.pkl', 'wb'))
    cPickle.dump(labels, open('labels.pkl', 'wb'))
    cPickle.dump(documents, open('documents.pkl', 'wb'))
    cPickle.dump(all_vec, open('all_vec.pkl', 'wb'))

    
    features = cPickle.load(open('features.pkl', 'rb'))
    targets = cPickle.load(open('targets.pkl', 'rb'))
    labels = cPickle.load(open('labels.pkl', 'rb'))
    documents = cPickle.load(open('documents.pkl', 'rb'))
    all_vec = cPickle.load(open('all_vec.pkl', 'rb'))
    '''

    # normalize the full feature matrix in place
    X_normal = np.array(all_vec)
    #X_normal = utilities.select_features(features_to_use, X_normal)
    # X_normal = StandardScaler().fit_transform(dataset[0])

    utilities.normalize_dataset(X_normal, features_to_use, 'learn')

    X_train = np.array(features['train'])
    X_test = np.array(features['test'])
    y_train = np.array(targets['train'])
    y_test = np.array(targets['test'])
    labels_train = np.array(labels['train'])
    labels_test = np.array(labels['test'])

    #X_train = utilities.select_features(features_to_use, X_train)
    #utilities.normalize_dataset(X_train, features_to_use)

    #X_test = utilities.select_features(features_to_use, X_test)
    #utilities.normalize_dataset(X_test, features_to_use)

    print("Dataset size: {}".format(len(all_vec)))

    # re-balance the training split (ratio 1) before fitting
    #(X_balanced, y_balanced, labels_balanced) = (X_train, y_train, labels_train)
    X_balanced, y_balanced, labels_balanced = utilities.balance_dataset(
        X_train, y_train, labels_train, 1)
    print("Used features: " + ','.join(features_to_use))
    print("Train set size: {}".format(len(X_balanced)))
    print("Number of True/False labels: {}/{}".format(
        sum(labels_balanced), sum(1 for i in labels_balanced if not i)))
    print("Test set size: {}".format(len(X_test)))
    print("Number of True/False labels: {}/{}".format(
        sum(labels_test), sum(1 for i in labels_test if not i)))
    print("Used features: {}".format(len(X_balanced[0])))

    dataset_json = json.loads(
        utilities.read_file('resources/CNN/documents.json'))
    test_documents = {int(key): dataset_json[key] for key in documents['test']}
    is_regressor = True
    for model_type in model_names:
        print('**********************' + model_type + '**********************')
        if model_type == 'dtr':
            # max_depth=6
            regr = tree.DecisionTreeRegressor(criterion='friedman_mse')
            regr = regr.fit(X_balanced, y_balanced)
            print(regr.get_params())
            export_name = 'dtr'
        elif model_type == 'linear':
            regr = linear_model.LinearRegression()
            # Train the model using the training sets
            regr.fit(X_balanced, y_balanced)
            # The coefficients
            print('Coefficients: \n', regr.coef_)
            export_name = 'linear'
        elif model_type == 'svm':
            regr = SVR(kernel='rbf',
                       degree=7,
                       verbose=False,
                       epsilon=0.000001,
                       gamma='scale',
                       tol=.0000001,
                       shrinking=True)
            # Train the model using the training sets
            regr.fit(X_balanced, y_balanced)
            # The coefficients
            print('Coefficients: \n', regr.get_params())
            export_name = 'svm'
        elif model_type == 'dummy':
            # random baseline; not a real regressor
            regr = RndRegressor()
            export_name = 'dummy'
            is_regressor = False
        elif model_type == 'ideal':
            from IdealRegressor import IdealRegressor
            regr = IdealRegressor(X_train, y_train)
            #regr.predict(X_train)
            regr.fit(X_test, y_test)
            #regr.predict(X_test)
            export_name = 'ideal'
        elif model_type == 'nb':
            #from sklearn import svm
            #regr = svm.SVC(gamma='scale').fit(X_train, labels_train)
            from sklearn.naive_bayes import ComplementNB, GaussianNB
            from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
            from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

            # classifier trained on labels, not regression targets
            regr = ComplementNB(alpha=0.015)
            regr.fit(X_train, labels_train)
            is_regressor = False
            export_name = 'nb'
        else:
            print("Regression type is undefined:" + model_type)
            continue
        # Make predictions using the testing set

        model_results = Learn.evaluate_model(regr, X_test, X_balanced, y_test,
                                             y_balanced, labels_test,
                                             labels_balanced, is_regressor)

        print('Summarizing dataset and evaluating Rouge...')

        rouge_scores = evaluate_summarizer(regr, test_documents,
                                           features_to_use, True)
        utilities.print_rouges(rouge_scores)
        utilities.export_model(regr, export_name)
        print(
            '*****************************************************************************'
        )
    # NOTE(review): returns results of the last model only
    return rouge_scores, model_results
Ejemplo n.º 18
0
# NOTE(review): script fragment -- `svm_reg`, `sgd_reg`, `fires_prepared`
# and `fires_labels` are defined earlier in the original file.
svm_reg.fit(fires_prepared, fires_labels)

#dt: decision-tree regressor baseline
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg.fit(fires_prepared, fires_labels)

#rf: random-forest regressor baseline
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(fires_prepared, fires_labels)

#2-1: list tunable parameter names ahead of the grid search
from sklearn.model_selection import GridSearchCV
print("sgd_reg.get_params().keys(): ", sgd_reg.get_params().keys())
print("svm_reg.get_params().keys(): ", svm_reg.get_params().keys())
print("tree_reg.get_params().keys(): ", tree_reg.get_params().keys())
print("forest_reg.get_params().keys(): ", forest_reg.get_params().keys())

# two candidate hyper-parameter grids for the SGD regressor
params_sgd = [
    {
        'alpha': [0.1, 0.5],
        'epsilon': [0.1, 1]
    },
    {
        'alpha': [0.5, 0.6],
        'epsilon': [0.1, 0.7]
    },
]

params_svm = {
Ejemplo n.º 19
0
# NOTE(review): Python 2 fragment -- `clf`, `X`, `y`, `y_pred`,
# `readDataSet`, `PCA` and `StandardScaler` come from earlier in the
# original file.
print (np.sum((y_pred - y)** 2)/len(X))  # in-sample MSE
print clf.score(X,y)
# print clf.best_estimator_
# print clf.best_score_
# print clf.best_params_
# print clf.cv_results_

print clf.support_
print clf.support_vectors_
# print clf.coef_
# y_pred = clf.predict(X)
# print y
# print y_pred
# print (np.sum((y_pred - y)** 2)/len(X))
# print clf.score(X,y)
print clf.get_params()
# joblib.dump(clf, "rbf_SVR_100k_4_111.pkl")

# evaluate on the held-out split of the MSD year-prediction data
data, nrows, ncols = readDataSet("YearPredictionMSDTest10.txt")
X = data[:,1:91]
y = data[:,0]

# NOTE(review): PCA/StandardScaler are refit on the test data here rather
# than reusing the training-time transforms -- likely a leakage bug; confirm.
clfp = PCA(n_components = 4)
X = clfp.fit_transform(X) 

X = StandardScaler().fit_transform(X)
y_pred = clf.predict(X)
print y_pred, y
print (np.sum((y_pred - y)** 2)/len(X))
print clf.score(X,y)
Ejemplo n.º 20
0
class Trainer():

	def __init__(self):
		
		with open('credentials.json') as credentials_file:
		    credentials = json.load(credentials_file)

		passwd = credentials['mysql']['password']
		self.con = mdb.connect(host='127.0.0.1', port=3306, user='******', passwd=passwd, db='insight', autocommit=True)
		self.cur = self.con.cursor()
		print "Connected to database"
		
		self.load_data()

	def load_data(self):
		f = open('./pickles/mysql_dump.pickle', 'rb')
		self.loanData = pickle.load(f)
		self.loanData = pd.DataFrame(self.loanData)
		f.close()

	def drop_na(self):
		self.loanData = loanData.dropna()
		self.loanData.index = range(len(self.loanData))

	def drop_columns(self):
		#drop the columns with malformed data in mysql db
		self.loanData = self.loanData.drop(['none',
											'educational',
											'IA',
											'IDAHO',
											'ME',
											'NE',
											'other_housing',
											'issue_year'], 1)

	def drop_prepaid_loans(self):
		indices_to_drop = []
		for i in range(len(self.loanData)):
			if self.loanData['loan_status'][i]==1 and self.loanData['days_to_zero_dollars'][i] < 1000:
				indices_to_drop.append(i)
		self.loanData = self.loanData.drop(indices_to_drop, 0)
		print "Number of prepaid loans: ", len(indices_to_drop)
		print "Number of loans after dropping prepaids: ", len(self.loanData)


	def define_features_targets(self, kind="regression"):
		
		#take out 1000 random loans with 36 month terms for testing
		#ids are already populated in test_loans for consistency
		test_ids = []
		sql_query = "select id from test_loans;"
		self.cur.execute(sql_query)
		sql_resp = self.cur.fetchall()
		print "length of sql response: ", len(sql_resp)
		for val in sql_resp:
			test_ids.append(val[0])
		print "length of test_ids: ", len(test_ids)
		#make the test and train data frames
		self.testLoanData = self.loanData[self.loanData['id'].isin(test_ids)]
		self.trainLoanData = self.loanData[~self.loanData['id'].isin(test_ids)]
		self.testLoanData.index = range(len(self.testLoanData))
		self.trainLoanData.index = range(len(self.trainLoanData))
		print "Train Loan Data: ", len(self.trainLoanData)
		print "Test Loan Data: ", len(self.testLoanData)
		
		self.features = self.trainLoanData.drop(['loan_status', 
											'days_to_zero_dollars',
											'id'], 1)
		self.features = self.features.values
		#choose different target variables for regression vs classification
		if kind == "regression":
			self.targets = self.trainLoanData['days_to_zero_dollars'].values
			self.y_test = self.testLoanData['days_to_zero_dollars'].values
		elif kind == "classification":
			self.targets = self.trainLoanData['loan_status'].values
			self.y_test = self.testLoanData['loan_status'].values

	def preprocess(self):
		(self.X_train, 
		 self.X_cv, 
		 self.y_train, 
		 self.y_cv) = dm.split_train_test(features=self.features, 
		 									targets=self.targets, 
		 									test_size=0.1)
		self.X_test = self.testLoanData.drop(['loan_status', 
											  'days_to_zero_dollars',
											  'id'], 1).values
		(self.X_train, self.X_cv) = dm.standardize_samples(self.X_train, 
														  self.X_cv)
		(self.X_train, self.X_cv) = dm.scale_samples_to_range(self.X_train, 
																self.X_cv)
		(self.X_test, _) = dm.standardize_samples(self.X_test, 
														  self.X_test)
		(self.X_test, _) = dm.scale_samples_to_range(self.X_test, 
																self.X_test)

	def define_dummy_classifier(self):
		self.clf = DummyClassifier()

	def define_rfr(self, n_estimators=10):
		self.regr = RandomForestRegressor(n_estimators=n_estimators, oob_score=True)
		print self.regr.get_params()

	def define_linear_regressor(self):
		self.regr = LinearRegression()
		print self.regr.get_params()

	def define_SVR(self, C=1, gamma=0.1):
		self.regr = SVR(C=C, gamma=gamma, verbose=3)
		print self.regr.get_params()

	def define_logistic_regressor(self, penalty="l2", C=1.0, class_weight=None):
		self.clf = LogisticRegression(penalty=penalty, 
									  C=C, 
									  class_weight=class_weight)
		print self.clf.get_params()

	def define_rfc(self, n_estimators=10):
		self.clf = RandomForestClassifier(n_estimators=n_estimators, oob_score=True)
		print self.clf.get_params()

	def train(self, kind="regression"):
		print "Fitting training data"
		if kind == "regression":
			self.regr.fit(self.X_train, self.y_train)
		elif kind == "classification":
			self.clf.fit(self.X_train, self.y_train)

	def predict(self, X, kind="regression"):
		if kind == "regression":
			self.prediction = self.regr.predict(X)
		elif kind == "classification":
			self.prediction = self.clf.predict(X)

	def score(self, X, y, kind="regression"):
		if kind == "regression":
			score_val = self.regr.score(X, y)
			print "R2 Score: ", score_val
		elif kind == "classification":
			score_val = self.clf.score(X, y)
			print "Accuracy: ", score_val
			print classification_report(y, self.prediction)
			self.precision = precision_score(y, self.prediction, labels=[0,1,2], average=None)
			print "\n\nPrecision Score: ", self.precision, "\n\n"
			self.accuracy = accuracy_score(y, self.prediction)

	def test(self, kind="regression"):
		#run clf and regr on the test data to determine to top 100 loans
		#the top loans are the ones least likely to default
		if kind == "regression":
			pred = self.regr.predict(self.X_test)
			print "length of regression pred: ", len(pred)
			for i, loan in enumerate(self.testLoanData['id']):
				sql_query = "UPDATE test_loans SET pred_days_to_zero_dollars=%s where id='%s';" %(
						pred[i], self.testLoanData['id'][i])
				self.cur.execute(sql_query)
			print i
		elif kind == "classification":
			pred_proba = self.clf.predict_proba(self.X_test)
			for i, loan in enumerate(self.testLoanData['id']):
				sql_query = "UPDATE test_loans SET pred_default=%s, pred_paid=%s, pred_prepaid=%s where id='%s';" %(
						pred_proba[i][0], pred_proba[i][1],pred_proba[i][2], self.testLoanData['id'][i])
				self.cur.execute(sql_query)
		self.con.close()

	def run_pca(self, n_components=20):
		self.pca = PCA(n_components=n_components)
		self.X_train = self.pca.fit_transform(self.X_train)
		print "Reduced data down to ", self.pca.n_components_, " dimensions: "
		print "Transforming cv data ..."
		self.X_cv = self.pca.transform(self.X_cv)
		print "Transforming test data ..."
		self.X_test = self.pca.transform(self.X_test)

	def plot_prediction(self):
		plt.scatter(self.prediction, self.y_cv)
		plt.xlabel('prediction')
		plt.ylabel('y_test')
		plt.show()

	def runSVRGridSearch(self):
		C_vals = [0.01, 0.1, 1, 10, 100]
		gamma_vals = [1E-2, 1E-1, 1, 1E1, 1E2, 1E3, 1E4]

		for C in C_vals:
			for gamma in gamma_vals:
				print "\n\n C: ", C, "  gamma: ", gamma
				self.define_SVR(C=C, gamma=gamma)
				self.train()
				print "Training Scores:"
				self.predict(self.X_train)
				self.score(self.X_train, self.y_train)
				print "Testing Scores:"
				self.predict(self.X_cv)
				self.score(self.X_cv, self.y_cv)

	def roc(self):
		'''Compute ROC curve using one-vs-all technique'''
		pred_proba = self.clf.predict_proba(self.X_cv)
		fpr = []
		tpr = []
		thresholds = []
		for i in [0, 1, 2]:
			fpr_i, tpr_i, thresholds_i = roc_curve(self.y_cv, pred_proba[:,i], pos_label=i)
			fpr.append(fpr_i)
			tpr.append(tpr_i)
			thresholds.append(thresholds_i)
			print "AUC: ", auc(fpr_i, tpr_i)
		plt.plot([0,1], [0,1], '--', color=(0.6, 0.6, 0.6))
		plt.plot(fpr[0], tpr[0], label="Default", linewidth=3)
		plt.xlim([-0.05, 1.05])
		plt.ylim([-0.05, 1.05])
		plt.show()


	def pickle_algo(self, X, fileName):
		print "pickling algorithm"
		f = open(fileName, 'wb')
		pickle.dump(X, f)
		f.close()
Ejemplo n.º 21
0
def run_experiments_without_cross_validation(model_names, features_to_use):
    """Train each named model on a fixed train/test split and report results.

    model_names: iterable of model identifiers ('dtr', 'linear', 'svm',
        'dummy', 'ideal', 'nb'); unknown names are skipped with a message.
    features_to_use: feature names forwarded to the utilities helpers.

    Returns (rouge_scores, model_results) for the *last* model evaluated,
    or (None, None) when no valid model name was supplied.
    """
    dataset_features = utilities.load_features('pasokh')
    features, targets, labels, documents, all_vec = utilities.split_dataset(dataset_features, features_to_use, 0.40)

    X_normal = np.array(all_vec)

    utilities.normalize_dataset(X_normal, features_to_use, 'learn')

    X_train = np.array(features['train'])
    X_test = np.array(features['test'])
    y_train = np.array(targets['train'])
    y_test = np.array(targets['test'])
    labels_train = np.array(labels['train'])
    labels_test = np.array(labels['test'])

    print("Dataset size: {}".format(len(X_normal)))
    # Balancing is currently disabled; the raw training split is used as-is.
    (X_balanced, y_balanced, labels_balanced) = (X_train, y_train, labels_train)
    #X_balanced, y_balanced, labels_balanced = utilities.balance_dataset(X_train, y_train, labels_train, 3)
    print("Train set size: {}".format(len(X_balanced)))
    print("Number of True/False labels: {}/{}".format(sum(labels_balanced), sum(1 for i in labels_balanced if not i)))
    print("Test set size: {}".format(len(X_test)))
    print("Number of True/False labels: {}/{}".format(sum(labels_test), sum(1 for i in labels_test if not i)))
    print("Used features: {}".format(len(X_balanced[0])))

    dataset_json = json.loads(utilities.read_file('resources/pasokh/all.json'))
    # Fix: make the return well-defined even when model_names is empty or
    # contains only unknown names (previously raised UnboundLocalError).
    rouge_scores, model_results = None, None
    for model_type in model_names:
        # Fix: reset per iteration; previously a single 'nb' entry flipped
        # this flag for every subsequent model in the list.
        is_regressor = True
        print('**********************' + model_type + '**********************')
        if model_type == 'dtr':
            # max_depth deliberately unbounded here (cf. run_experiments).
            regr = tree.DecisionTreeRegressor()
            regr = regr.fit(X_balanced, y_balanced)
            export_name = 'dtr'
        elif model_type == 'linear':
            # NOTE(review): `normalize=` was deprecated/removed in recent
            # scikit-learn releases -- verify the pinned version.
            regr = linear_model.LinearRegression(normalize=True)
            # Train the model using the training sets
            regr.fit(X_balanced, y_balanced)
            # The coefficients
            print('Coefficients: \n', regr.coef_)
            export_name = 'linear'
        elif model_type == 'svm':
            regr = SVR(verbose=True, epsilon=0.00001, gamma='auto', tol=.00001)
            # Train the model using the training sets
            regr.fit(X_balanced, y_balanced)
            # The hyperparameters actually in effect
            print('Coefficients: \n', regr.get_params())
            export_name = 'svm'
        elif model_type == 'dummy':
            # Random-scoring baseline.
            regr = RndRegressor()
            export_name = 'dummy'
        elif model_type == 'ideal':
            # Oracle baseline: memorizes the test targets (upper bound).
            from IdealRegressor import IdealRegressor
            regr = IdealRegressor(X_train, y_train)
            regr.fit(X_test, y_test)
            export_name = 'ideal'
        elif model_type == 'nb':
            from sklearn.naive_bayes import ComplementNB, GaussianNB
            from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
            from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

            regr = ComplementNB(alpha=1)
            regr.fit(X_train, labels_train)
            is_regressor = False
            export_name = 'nb'
        else:
            print("Regression type is undefined:" + model_type)
            continue

        # Evaluate on held-out sentences, then score generated summaries.
        model_results = evaluate_model(regr, X_test, X_balanced, y_test, y_balanced, labels_test, labels_balanced, is_regressor)

        print('Summarizing dataset and evaluating Rouge...')
        rouge_scores = evaluate_summarizer(regr, dataset_json, features_to_use, True)
        utilities.print_rouges(rouge_scores)
        print('*****************************************************************************')
    return rouge_scores, model_results
Ejemplo n.º 22
0
print ''

# Linear-kernel SVC: fit, then report score on train and test splits.
svc = SVC(gamma=0.001, kernel='linear')
print 'SVC config:'
print svc.get_params()
svc.fit(smr_train.feature_matrix, smr_train.labels)
svc_score_train = svc.score(smr_train.feature_matrix, smr_train.labels)
print 'SVC precision train: {}'.format(svc_score_train)
svc_score_test = svc.score(smr_test.feature_matrix, smr_test.labels)
print 'SVC precision test: {}'.format(svc_score_test)
# plot_learning_curve(svc, 'SVC Curve', smr_train.feature_matrix, smr_train.labels, n_jobs=4)
print ''

# Default SVR on the same data.
# NOTE(review): for a regressor, .score() is R^2, not precision -- the
# printed label is misleading; confirm and reword upstream if desired.
svr = SVR()
print 'SVR config:'
print svr.get_params()
svr.fit(smr_train.feature_matrix, smr_train.labels)
svr_score_train = svr.score(smr_train.feature_matrix, smr_train.labels)
print 'SVR precision train: {}'.format(svr_score_train)
svr_score_test = svr.score(smr_test.feature_matrix, smr_test.labels)
print 'SVR precision test: {}'.format(svr_score_test)
# plot_learning_curve(svr, 'SVR Curve', smr_train.feature_matrix, smr_train.labels, n_jobs=4)
print ''

# LinearSVC variant (liblinear backend).
lsvc = LinearSVC()
print 'LinearSVC config:'
print lsvc.get_params()
lsvc.fit(smr_train.feature_matrix, smr_train.labels)
lsvc_score_train = lsvc.score(smr_train.feature_matrix, smr_train.labels)
print 'LinearSVC precision train: {}'.format(lsvc_score_train)
lsvc_score_test = lsvc.score(smr_test.feature_matrix, smr_test.labels)
#result_X = preprocessing.scale(result_X)

# Presumably: column 1 holds the target, columns 2+ the features -- verify
# against where result_X is built.
X_feats = result_X[:, 2:]
X_target = result_X[:, 1]

#pca = PCA(100)
#X_feats = pca.fit_transform(X_feats)

############clf = sv reg###############
clf = SVR(C = 1.0, epsilon = 0.2)
clf.fit(X_feats, X_target)

# NOTE(review): the model is evaluated on its own training data here, so
# the gini score below is an optimistic (resubstitution) estimate.
X_test = X_feats
Y_test = X_target
############

predicted_x = clf.predict(X_test)

a = normalized_gini(predicted_x, Y_test)
print a#("GINI Score : ", a)

P = clf.get_params()
#np.savetxt('svr1.txt', clf.coef_)
#result_X = np.column_stack(result + [[1]*len(result[0])])
#beta_hat = np.linalg.lstsq(result[1:[1, 2, 3]], result[1:,[0]].T)[0]
#print clf.coef_

# Write the hyperparameter names (dict keys) as a single CSV row.
with open('svr1.csv', 'wb') as csvfile:
    swriter = csv.writer(csvfile, delimiter=',')
    swriter.writerow([x for x in P])
Ejemplo n.º 24
0
def standard_experiment(train_df, test_df, feature_names, args):
    """Train one regressor on train_df, evaluate NDCG/MSE on both splits,
    and optionally persist the model, top features, and predictions.

    train_df/test_df: pandas DataFrames containing feature_names columns,
        args.target, and ranking metadata (position_rank, parent_nchildren).
    feature_names: columns used as model inputs.
    args: parsed CLI options (classifier, target, stock_params, rfseed,
        limit_data, ndcg_weight/target, min_posts_ndcg, savename, ...).
    """
    train_df['set'] = "train"  # annotate
    test_df['set'] = "test"  # annotate

    # clip training set, if necessary
    if (0 < args.limit_data < len(train_df)):
        print "Clipping training set to %d comments" % args.limit_data
        train_df = train_df[:args.limit_data]

    # Split into X, y for regression
    target = args.target
    train_X = train_df.filter(feature_names).as_matrix().astype(
        np.float)  # training data
    train_y = train_df.filter([target]).as_matrix().astype(
        np.float)  # training labels
    test_X = test_df.filter(feature_names).as_matrix().astype(
        np.float)  # test data
    test_y = test_df.filter([target
                             ]).as_matrix().astype(np.float)  # ground truth

    # For compatibility, make 1D
    train_y = train_y.reshape((-1, ))
    test_y = test_y.reshape((-1, ))

    print "Training set: %d examples" % (train_X.shape[0], )
    print "Test set: %d examples" % (test_X.shape[0], )
    print "Selected %d features" % (len(feature_names), )
    print 'Features: %s' % (' '.join(feature_names))

    ##
    # Preprocessing: scale data, keep SVM happy
    scaler = preprocessing.StandardScaler()
    train_X = scaler.fit_transform(
        train_X)  # faster than fit, transform separately
    test_X = scaler.transform(test_X)

    if args.classifier != 'baseline':
        if args.stock_params:
            # Fixed, pre-chosen hyperparameters (no grid search).
            if args.classifier == 'svr':
                print "Initializing SVR model"
                clf = SVR(**STANDARD_PARAMS['svr'])
            elif args.classifier == 'rf':
                print "Initializing RandomForestRegressor model, seed=%d" % args.rfseed
                clf = RandomForestRegressor(random_state=args.rfseed,
                                            **STANDARD_PARAMS['rf'])
            elif args.classifier == 'elasticnet':
                print "Initializing ElasticNet model"
                clf = ElasticNet(max_iter=10000,
                                 **STANDARD_PARAMS['elasticnet'])
            else:
                raise ValueError("Invalid classifier '%s' specified." %
                                 args.classifier)

        else:
            ##
            # Run Grid Search / 10xv on training/dev set
            start = time.time()
            print "== Finding optimal classifier using Grid Search =="
            params, clf = train_optimal_classifier(train_X,
                                                   train_y,
                                                   classifier=args.classifier,
                                                   rfseed=args.rfseed,
                                                   quickmode=args.quickmode)
            print "Optimal parameters: " + json.dumps(params, indent=4)
            if hasattr(clf, "support_vectors_"):
                print 'Number of support vectors: %d' % len(
                    clf.support_vectors_)
            print "Took %.2f minutes to train" % ((time.time() - start) / 60.0)

        # Re-seed and (re)fit on the full training split.
        if hasattr(clf, 'random_state'):
            clf.set_params(random_state=args.rfseed)
        clf.fit(train_X, train_y)
        params = clf.get_params()

    ##
    # Set up evaluation function
    if args.ndcg_weight == 'target':
        favfunc = evaluation.fav_target  # score weighting
    else:
        favfunc = evaluation.fav_linear  # rank weighting

    # NOTE: result_label is assigned *below*; the lambda only reads it when
    # called (after the assignment), so Python's late binding makes this OK.
    max_K = 20
    eval_func = lambda data: evaluation.ndcg(data,
                                             max_K,
                                             target=args.ndcg_target,
                                             result_label=result_label,
                                             fav_func=favfunc)

    ##
    # Predict scores for training set
    result_label = "pred_%s" % args.target  # e.g. pred_score
    if args.classifier != 'baseline':
        train_pred = clf.predict(train_X)
    else:  # baseline: post order
        train_pred = -1 * train_df['position_rank']
    train_df[result_label] = train_pred

    print 'Performance on training data (NDCG with %s weighting)' % args.ndcg_weight
    # ndcg_train = eval_func(train_df)
    ndcg_train = eval_func(
        train_df[train_df.parent_nchildren >= args.min_posts_ndcg])
    for i, score in enumerate(ndcg_train, start=1):
        print '\tNDCG@%d: %.5f' % (i, score)
    print 'Karma MSE: %.5f' % mean_squared_error(train_y, train_pred)

    ##
    # Predict scores for test set
    if args.classifier != 'baseline':
        test_pred = clf.predict(test_X)
    else:  # baseline: post order
        test_pred = -1 * test_df['position_rank']
    test_df[result_label] = test_pred

    print 'Performance on test data (NDCG with %s weighting)' % args.ndcg_weight
    # ndcg_test = eval_func(test_df)
    ndcg_test = eval_func(
        test_df[test_df.parent_nchildren >= args.min_posts_ndcg])
    for i, score in enumerate(ndcg_test, start=1):
        print '\tNDCG@%d: %.5f' % (i, score)
    print 'Karma MSE: %.5f' % mean_squared_error(test_y, test_pred)

    ##
    # Save model to disk
    if args.savename and (args.classifier != 'baseline'):
        import cPickle as pickle
        saveas = args.savename + ".model.pkl"
        print "== Saving model as %s ==" % saveas
        with open(saveas, 'w') as f:
            pickle.dump(clf, f)

    ##
    # Get feature importance, if possible
    if args.savename and (args.classifier != 'baseline'):
        feature_importances = get_feature_importance(
            clf, args.classifier, feature_names=feature_names, sorted=True)
        saveas = args.savename + ".topfeatures.txt"
        print "== Recording top features to %s ==" % saveas
        # np.savetxt(saveas, feature_importances)
        # with open(saveas, 'w') as f:
        # json.dump(feature_importances, f, indent=2)
        with open(saveas, 'w') as f:
            maxlen = max([len(fname) for fname in feature_importances[0]])
            f.write("# Model: %s\n" % args.classifier)
            f.write("# Params: %s\n" % json.dumps(params))
            for fname, val in zip(*feature_importances):
                f.write("%s  %.06f\n" % (fname.ljust(maxlen), val))
            f.flush()

    ##
    # Save data to HDF5
    if args.savename:

        # Save score predictions
        fields = [
            "self_id", "parent_id", 'cid', 'sid', 'set', args.target,
            result_label
        ]
        if not args.ndcg_target in fields:
            fields.append(args.ndcg_target)
        saveas = args.savename + ".scores.h5"
        print "== Saving raw predictions as %s ==" % saveas
        outdf = pd.concat([train_df[fields], test_df[fields]],
                          ignore_index=True)
        outdf.to_hdf(saveas, 'data')

        if args.savefull:
            # Concatenate train, test
            df = pd.concat([train_df, test_df], ignore_index=True)

            print "== Exporting data to HDF5 =="
            saveas = args.savename + ".data.h5"
            df.to_hdf(saveas, "data")
            print "  [saved as %s]" % saveas

        # Save NDCG calculations
        dd = {
            'k': range(1, max_K + 1),
            'method': [args.ndcg_weight] * max_K,
            'ndcg_train': ndcg_train,
            'ndcg_test': ndcg_test
        }
        resdf = pd.DataFrame(dd)
        saveas = args.savename + ".results.csv"
        print "== Saving results to %s ==" % saveas
        resdf.to_csv(saveas)
Ejemplo n.º 25
0
        else:
            noOfInstances += 1
            values = line.split('\n')[0]
            values = values.split(',')
            tempValues.extend(values[:noOfAttrs - 1])
            outputs.append(values[noOfAttrs - 1])
    for i in range(noOfInstances):
        for j in range(noOfAttrs - 1):
            row_ind.append(i)
            col_ind.append(j)

    tempValues = list(map(float, tempValues))
    dataset = sparse.coo_matrix((tempValues, (row_ind, col_ind))).toarray()
    outputs = list(map(float, outputs))
    return dataset, outputs


trainingDataset, trainingoutputs = getDataset('FileName_training.csv')

# RBF-kernel SVR with fixed hyperparameters; the fitted model is persisted
# with joblib so training only has to run once.
svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
print(svr_rbf.get_params())
print('Training Started....')
train_rbf = svr_rbf.fit(trainingDataset, trainingoutputs)
joblib.dump(train_rbf, 'Model.pkl')
'''svr_poly = SVR(kernel='poly', C=1e3, degree=2)
print(svr_poly.get_params())
print('Training Started....')
train_poly = svr_poly.fit(trainingDataset,trainingoutputs)
joblib.dump(train_poly,'Model.pkl')'''

print('Model Saved....')
Ejemplo n.º 26
0
        test_set.append([predictiveAttributeDegree[i][11], predictiveAttributeDegree[i][13]])
        test_result.append([predictiveAttributeDegree[i][2]])
# 80/20 order-preserving split of the non-degree records into train/test.
train_percent = (len(predictiveAttributeNotDegree)/100)*80
count = 0
for i in range(len(predictiveAttributeNotDegree)):
    if count < train_percent:
        count = count + 1
        train_set.append([predictiveAttributeNotDegree[i][11], predictiveAttributeNotDegree[i][13]])
        train_result.append([predictiveAttributeNotDegree[i][2]])
    else:
        test_set.append([predictiveAttributeNotDegree[i][11], predictiveAttributeNotDegree[i][13]])
        test_result.append([predictiveAttributeNotDegree[i][2]])

# Polynomial-kernel SVR; ravel() flattens the (n, 1) target column for fit().
svm_reg = SVR(kernel="poly", degree=2, C=50, epsilon=1.0, gamma="auto")
train_result = np.array(train_result)
svm_reg.fit(train_set, train_result.ravel())

print(svm_reg.score(test_set, test_result))
# Fix: predict the whole test set in one vectorized call instead of one
# predict() call per row followed by manual array reshuffling; the values
# are identical and this avoids n estimator invocations.
pred = svm_reg.predict(test_set)

print(("MSE: {}".format(mean_squared_error(pred, test_result))))
print("Params: ", svm_reg.get_params())

Ejemplo n.º 27
0
def run_experiments(model_names):
    """
    This version splits original texts in dataset for evaluating summaries.

    model_names: iterable of model identifiers ('dtr', 'linear', 'svm',
        'dummy', 'ideal', 'nb'); unknown names are skipped with a message.
    Returns (rouge_scores, model_results) for the *last* model evaluated,
    or (None, None) when no valid model name was supplied.
    """
    valid_features = ['cue_words', 'tfisf',
                      'cosine_position', 'relative_len',
                      # 'tf',
                      #'pos_ve_ratio', 'pos_aj_ratio', 'pos_nn_ratio', 'pos_av_ratio', 'pos_num_ratio', 'len', 'position'
                      'doc_words', 'doc_sens',# 'doc_parag', # 'category',
                      #'doc_verbs', 'doc_adjcs', 'doc_advbs', 'doc_nouns',
                      'nnf_isnnf', 'vef_isvef', 'ajf_isajf', 'avf_isavf', 'nuf_isnuf',
                      'political', 'social', 'sport', 'culture', 'economy', 'science'
                      ]
    # NOTE(review): `learning_features` is not defined in this function --
    # presumably a module-level constant; verify.
    features, targets, labels, documents, all_vec = load_dataset_splitted('features.json', learning_features)

    X_normal = np.array(all_vec)
    X_normal = select_features(valid_features, X_normal)

    normalize_dataset(X_normal, valid_features, 'learn')

    X_train = np.array(features['train'])
    X_test = np.array(features['test'])
    y_train = np.array(targets['train'])
    y_test = np.array(targets['test'])
    labels_train = np.array(labels['train'])
    labels_test = np.array(labels['test'])

    X_train = select_features(valid_features, X_train)
    normalize_dataset(X_train, valid_features)

    X_test = select_features(valid_features, X_test)
    normalize_dataset(X_test, valid_features)

    print("Dataset size: {}".format(len(all_vec)))

    # Balancing is currently disabled; the raw training split is used as-is.
    (X_balanced, y_balanced, labels_balanced) = (X_train, y_train, labels_train)
    #X_balanced, y_balanced, labels_balanced = balance_dataset(X_train, y_train, labels_train, 3)
    print("Train set size: {}".format(len(X_balanced)))
    print("Number of True/False labels: {}/{}".format(sum(labels_balanced), sum(1 for i in labels_balanced if not i)))
    print("Test set size: {}".format(len(X_test)))
    print("Number of True/False labels: {}/{}".format(sum(labels_test), sum(1 for i in labels_test if not i)))
    print("Used features: {}".format(len(X_balanced[0])))

    dataset_json = json.loads(read_file('resources/pasokh/all.json'))
    test_documents = {key: dataset_json[key] for key in documents['test']+documents['train']}
    # Fix: make the return well-defined even when model_names is empty or
    # contains only unknown names (previously raised UnboundLocalError).
    rouge_scores, model_results = None, None
    for model_type in model_names:
        # Fix: reset per iteration; previously a single 'nb' entry flipped
        # this flag for every subsequent model in the list.
        is_regressor = True
        print('**********************' + model_type + '**********************')
        if model_type == 'dtr':
            regr = tree.DecisionTreeRegressor(max_depth=6)
            regr = regr.fit(X_balanced, y_balanced)
            export_name = 'dtr'
        elif model_type == 'linear':
            regr = linear_model.LinearRegression()
            # Train the model using the training sets
            regr.fit(X_balanced, y_balanced)
            # The coefficients
            print('Coefficients: \n', regr.coef_)
            export_name = 'linear'
        elif model_type == 'svm':
            regr = SVR(verbose=True, epsilon=0.001, gamma='auto', tol=.00001)
            # Train the model using the training sets
            regr.fit(X_balanced, y_balanced)
            # The hyperparameters actually in effect
            print('Coefficients: \n', regr.get_params())
            export_name = 'svm'
        elif model_type == 'dummy':
            # Random-scoring baseline.
            regr = RndRegressor()
            export_name = 'dummy'
        elif model_type == 'ideal':
            # Oracle baseline: memorizes the targets (upper bound).
            from IdealRegressor import IdealRegressor
            regr = IdealRegressor(X_normal, targets)
            export_name = 'ideal'
        elif model_type == 'nb':
            from sklearn.naive_bayes import ComplementNB, GaussianNB
            from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
            from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

            regr = ComplementNB(alpha=0.015)
            regr.fit(X_train, labels_train)
            is_regressor = False
            export_name = 'nb'
        else:
            print("Regression type is undefined:" + model_type)
            continue

        # Evaluate on held-out sentences, then score generated summaries.
        model_results = evaluate_model(regr, X_test, X_balanced, y_test, y_balanced, labels_test, labels_balanced, is_regressor)

        print('Summarizing dataset and evaluating Rouge...')

        rouge_scores = evaluate_summarizer(regr, test_documents, valid_features)
        utilities.print_rouges(rouge_scores)
        print('*****************************************************************************')
    return rouge_scores, model_results
Ejemplo n.º 28
0
def main():
    #picklef = open(config_file, 'r')
    #config_dict = pickle.load(picklef)

    print "\n========================="
    print "SURROGATE MODEL GENERATOR"
    print "========================="
    print "PARSE AND CLEAN DATA"
    print "========================="
    # load design and target data into a pandas dataframe from the input csv
    dataframe = pd.read_csv(input_data_file)

    # drop rows (samples) with NaNs in them
    dataframe = dataframe[dataframe.isnull() == False]

    # split the dataframe into design and target dataframes
    design_data = dataframe[features]
    design_labels = design_data.axes

    target_data = dataframe[targets]
    target_labels = target_data.axes

    if DEBUG:
        print "\nFeatures:\n", design_data
        print "\nTargets:\n", target_data

    print "\nParsed data shapes\n design data: ", np.shape(
        design_data), "\n target data: ", np.shape(target_data)
    print " #samples: %d\n #input parameters: %d" % (np.shape(design_data)[0],
                                                     np.shape(design_data)[1])
    print " #output parameters: %d" % np.shape(target_data)[1]

    if DEBUG:
        print "design data:"
        print design_data
        print "target_data:"
        print target_data

    if test_split > 0.0:
        print "\n========================="
        print "SPLIT TRAIN AND TEST DATASETS"
        print "========================="
        # split the data into a training set and a testing set for validation later.
        X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
            design_data, target_data, test_size=test_split)

        print "\nX_train, Y_train:", np.shape(X_train), np.shape(Y_train)
        print "X_test, Y_test:", np.shape(X_test), np.shape(Y_test)
        print "training sample size: %d" % np.shape(X_train)[0]
        print "testing sample size: %d" % np.shape(X_test)[0]
        if DEBUG:
            print "X_train:\n", X_train
            print "Y_train:\n", Y_train
    else:
        X_train = design_data
        Y_train = target_data
        X_test, Y_test = [], []
    # standardize the training data to mean 0 and variance 1
    if normalize is True:
        print "\n========================="
        print "DATA NORMALIZATION AND SCALING"
        print "========================="

        # initialize a StandardScaler object to calculate the means and scaling values of each design
        # parameter (that is, it calculates the means and stdevs over the columns).
        # We then use the scaler object to transform the entire input data set (except for the design ID
        # number) to their normalized values.
        X_train_scaler = preprocessing.MinMaxScaler(
            feature_range=(0, 1)).fit(X_train)
        X_train_scaled = pd.DataFrame(X_train_scaler.transform(X_train),
                                      columns=X_train.axes[1])
        if test_split > 0.0:
            X_test_scaler = preprocessing.MinMaxScaler(
                feature_range=(0, 1)).fit(X_test)
            X_test_scaled = pd.DataFrame(X_test_scaler.transform(X_test),
                                         columns=X_test.axes[1])
        else:
            X_test_scaled = []

        print "\n feature min: ", X_train_scaler.data_min_
        print " feature max: ", X_train_scaler.data_max_
        print " feature range: ", X_train_scaler.data_range_
        print " feature scales: \n", X_train_scaler.scale_

        print "\nScaled training inputs:"
        print " shape: ", np.shape(X_train_scaled)

        if DEBUG:
            print "\n X_train_scaled:\n", X_train_scaled
            print "\nScaled testing inputs:"
            print " shape:", np.shape(X_test_scaled)
            print "\n X_test_scaled:\n", X_test_scaled

        Y_train_scaler = preprocessing.MinMaxScaler(
            feature_range=(0, 1)).fit(Y_train)
        Y_train_scaled = pd.DataFrame(Y_train_scaler.transform(Y_train),
                                      columns=Y_train.axes[1])
        if test_split > 0.0:
            Y_test_scaler = preprocessing.MinMaxScaler(
                feature_range=(0, 1)).fit(Y_test)
            Y_test_scaled = pd.DataFrame(Y_test_scaler.transform(Y_test),
                                         columns=Y_test.axes[1])
        else:
            Y_test_scaled = []

        print "\n output min: ", Y_train_scaler.data_min_
        print " output max: ", Y_train_scaler.data_max_
        print " output range: ", Y_train_scaler.data_range_
        print " output scales: \n", Y_train_scaler.scale_

        print "\nScaled training inputs:"
        print " shape: ", np.shape(Y_train_scaled)

        if DEBUG:
            print "\n Y_train_scaled:\n", Y_train_scaled
            print "\nScaled testing inputs:"
            print " shape:", np.shape(Y_test_scaled)
            print "\n Y_test_scaled:\n", Y_test_scaled
            #print "\nBefore scaling:"
            #print np.shape(X_train)
            #print X_train

        # This is just for visualizing the normalization transformations with histograms
        if DEBUG is True and 1:
            fig, axes = plt.subplots(np.shape(X_train)[1],
                                     sharex=True,
                                     sharey=True)
            for ax, label in izip(axes, X_train.axes[1]):
                ax.hist(X_train[label], bins=7)
                ax.set_title(label)
            fig.suptitle(
                "Distribution of design parameters before normalization")

            fig, axes = plt.subplots(np.shape(X_train_scaled)[1],
                                     sharex=True,
                                     sharey=True)
            print X_train_scaled.axes
            for ax, label in izip(axes, X_train_scaled.axes[1]):
                ax.hist(X_train_scaled[label], bins=7)
                ax.set_title(label)
            fig.suptitle(
                "Distribution of design parameters after normalization")

            if len(Y_train) is not 0 and len(Y_train_scaled) is not 0:
                fig, axes = plt.subplots(np.shape(Y_train)[1],
                                         sharex=True,
                                         sharey=True)
                for ax, label in izip(axes, Y_train.axes[1]):
                    ax.hist(Y_train[label], bins=7)
                    ax.set_title(label)
                fig.suptitle(
                    "Distribution of performance parameters before normalization"
                )

                fig, axes = plt.subplots(np.shape(Y_train_scaled)[1],
                                         sharex=True,
                                         sharey=True)
                for ax, label in izip(axes, Y_train_scaled.axes[1]):
                    ax.hist(Y_train_scaled[label], bins=7)
                    ax.set_title(label)
                fig.suptitle(
                    "Distribution of performance parameters after normalization"
                )
            plt.show()
    else:
        X_train_scaled = X_train
        X_test_scaled = X_test
    print "\n========================="
    print "SUPPORT VECTOR REGRESSION"
    print "========================="

    surrogate_models = [
    ]  # Array to hold the surrogate model objects for each output parameter

    # If gridsearch is True, use scikit-learn's gridsearch to systematically search for optimal
    # hyperparameter values. Else, we use hyperparameter values set by the user to construct and
    # train surrogate models for each performance variable.
    if gridsearch:
        # construct a surrogate model for each output parameter (performance metric)
        print "My God... They're learning..."
        for n, target_parameter in enumerate(Y_train_scaled):
            print "\n------------------------"
            print target_parameter
            print "------------------------"
            if DEBUG: print Y_train_scaled[target_parameter]
            model = generate_optimized_surrogate(
                X_train_scaled,
                Y_train_scaled[target_parameter],
                label=target_parameter,
                C_range=C_range,
                epsilon_range=epsilon_scale,
                grid_iter=optimize_iter,
                scoring=model_scoring)
            surrogate_models.append(model)
    else:
        for n, target_parameter in enumerate(Y_train_scaled):
            print "\n------------------------"
            print target_parameter
            print "------------------------"
            model = SVR(kernel='rbf',
                        C=C_tuple[n],
                        epsilon=epsilon_tuple[n],
                        gamma='auto').fit(X_train_scaled,
                                          Y_train_scaled[target_parameter])
            surrogate_models.append(model)

    print "\nSurrogate models:\n", surrogate_models
    """
    print np.shape(surrogate_model)
    print surrogate_model
    # make predictions over the output surrogate data.
    #prediction_outputs = [model.predict(X_train_scaled) for model in surrogate_model]
    prediction_outputs = surrogate_model[1].predict(X_train_scaled)
    print np.shape(prediction_outputs)
    print prediction_outputs
    """

    # If the sampled data was split into training and testing sets, evaluate the generated models
    # on the testing data. Otherwise, compute cross-validated scores using the training data.

    # First, instantiate a list to hold our scaler (transformation) objects to transform the values
    # predicted by the models to the range of the performance metrics being modeled.
    Y_scalers = []
    for n, model in enumerate(surrogate_models):
        print "\n------------------------"
        print targets[n]
        print "------------------------"

        if test_split > 0.0:
            print "\n========================="
            print "MODEL EVALUATION"
            print "========================="
            predictions = model.predict(X_test_scaled)
            target_values = Y_test[targets[n]]
            # reverse-transform the outputs and predictions back to their original values
            Y_test_scaler = preprocessing.MinMaxScaler().fit(
                Y_test[targets[n]].reshape(-1, 1))
            predictions = Y_test_scaler.inverse_transform(
                predictions.reshape(-1, 1))

            #print Y_test[:,n]
            #print predictions
            #result_array = np.column_stack((Y_test[:,n].reshape(-1,1), predictions))

            print "test values, predicted values"
            print target_values, predictions
            print "model score:", metrics.mean_squared_error(
                target_values, predictions)
            #print "model score: ", model.score(target_values, predictions)
            print "model parameters:"
            parameters = model.get_params()
            print ' C: ', parameters['C']
            print ' epsilon: ', parameters['epsilon']
            #print ' gamma: ', parameters['gamma']

        # If a testing set was not set aside, use Leave-One-Out (LOO) cross-validation
        else:
            scaled_target_values = Y_train_scaled[targets[n]].values
            target_values = Y_train[targets[n]].values

            scores = cross_validation.cross_val_score(
                model,
                X_train_scaled.values,
                scaled_target_values,
                scoring='mean_squared_error',
                cv=len(Y_train_scaled))

            avg_score = np.mean(scores)
            score_std = np.std(scores)
            print "model avg score: %1.5f (+/-%1.5f)" % (-avg_score, score_std)

            predictions = cross_validation.cross_val_predict(
                model,
                X_train_scaled.values,
                scaled_target_values,
                cv=len(Y_train_scaled))

            # Make a scaler and inverse transform the predictions back to their original, unscaled ranges
            Y_test_scaler = preprocessing.MinMaxScaler().fit(target_values)
            predictions = Y_test_scaler.inverse_transform(predictions)
            Y_scalers.append(Y_test_scaler)
            print "Y_scalers[%d]: " % n, Y_scalers[n]

        # plot the predicted vs actual values
        fig, ax = plt.subplots()
        ax.scatter(predictions, target_values, marker='x')
        ax.plot(target_values, target_values, c='b', linestyle='--')
        ax.set_xlabel("Predicted Values")
        ax.set_ylabel("Actual Values")
        ax.set_title("Predicted vs Actual Target Values: %s" % targets[n])

        fig.savefig('%s%s_%s_predicted_vs_actual.png' %
                    (output_directory, data_title, targets[n]))
    """
    if test_split > 0.0:
        print "\n========================="
        print "MODEL EVALUATION"
        print "========================="

        # step through each model and evaluate its performance on the testing data
        for n, model in enumerate(surrogate_models):
            print "\n------------------------"
            print targets[n]
            print "------------------------"
            predictions = model.predict(X_test_scaled)
            target_values = Y_test[targets[n]]
            # reverse-transform the outputs and predictions back to their original values
            Y_test_scaler = preprocessing.MinMaxScaler().fit(Y_test[targets[n]].reshape(-1,1))
            predictions = Y_test_scaler.inverse_transform(predictions.reshape(-1,1))

            #print Y_test[:,n]
            #print predictions
            #result_array = np.column_stack((Y_test[:,n].reshape(-1,1), predictions))

            print "test values, predicted values"
            print target_values, predictions
            print "model score:", metrics.mean_squared_error(target_values, predictions)
            #print "model score: ", model.score(target_values, predictions)
            print "model parameters:"
            parameters = model.get_params()
            print ' C: ', parameters['C']
            print ' epsilon: ', parameters['epsilon']
            #print ' gamma: ', parameters['gamma']

            # plot the predicted vs actual values
            fig, ax = plt.subplots()
            ax.scatter(predictions, target_values, marker = 'x')
            ax.plot(target_values, target_values, c='b', linestyle='--')
            ax.set_xlabel("Predicted Values")
            ax.set_ylabel("Actual Values")
            ax.set_title("Predicted vs Actual Target Values: %s" %targets[n])

            fig.savefig('%s%s_predicted_vs_actual.png' %(output_directory, targets[n]))

    else:
        print "\n========================="
        print "MODEL CROSS-VALIDATION"
        print "========================="

        # Use cross-validation to evaluate the models created above
        for n, model in enumerate(surrogate_models):
            print "\n------------------------"
            print targets[n]
            print "------------------------"

            scaled_target_values = Y_train_scaled[targets[n]].values
            target_values = Y_train[targets[n]].values

            scores = cross_validation.cross_val_score(model, 
                                                      X_train_scaled.values, 
                                                      scaled_target_values,
                                                      scoring = 'mean_squared_error',
                                                      cv = len(Y_train_scaled))

            avg_score = np.mean(scores)
            score_std = np.std(scores)
            print "model avg score: %1.5f (+/-%1.5f)" %(-avg_score, score_std)

            predictions = cross_validation.cross_val_predict(model,
                                                             X_train_scaled.values,
                                                             scaled_target_values,
                                                             cv = len(Y_train_scaled))

            # Make a scaler and inverse transform the predictions back to their original, unscaled ranges
            Y_test_scaler = preprocessing.MinMaxScaler().fit(target_values)
            predictions = Y_test_scaler.inverse_transform(predictions)

            # plot the predicted vs actual values
            fig, ax = plt.subplots()
            ax.scatter(predictions, target_values, marker = 'x')
            ax.plot(target_values, target_values, c='b', linestyle='--')
            ax.set_xlabel("Predicted Values")
            ax.set_ylabel("Actual Values")
            ax.set_title("Predicted vs Actual Target Values: %s" %targets[n])

            fig.savefig('%s%s_predicted_vs_actual.png' %(output_directory, targets[n]))
    """
    if save_models is True:
        model_file = data_title + "_surrogate_models.pkl"
        input_scaler_file = data_title + "_input_scalers.pkl"
        scaler_file = data_title + "_datascalers.pkl"
        models_savefile = output_directory + model_file
        input_scalers_savefile = output_directory + input_scaler_file
        scalers_savefile = output_directory + scaler_file
        #models_savefile = "%s%s_surrogate_models.pkl" %(output_directory, data_name)
        #scalers_savefile = "%s%s_datascalers.pkl" %(output_directory, data_name)

        with open(models_savefile, 'w') as f:
            pickle.dump(surrogate_models, f)

        with open(input_scalers_savefile, 'w') as f:
            pickle.dump(X_train_scaler, f)

        with open(scalers_savefile, 'w') as f:
            pickle.dump(Y_scalers, f)

    return surrogate_models, Y_scalers
Ejemplo n.º 29
0
# Load the receiving-times samples: all columns except the last are features.
# NOTE(review): column index 4 is used as the target, which is the last
# column only if the CSV has exactly 5 columns -- confirm.
dataset = pd.read_csv('ReceivingTimes.csv')
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,4].values

# Using scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Split the data into training (75%) and testing (25%) sets with a fixed seed
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.25, random_state = 42)

from sklearn.svm import SVR
# Fit a linear-kernel SVR.  NOTE(review): 'degree' only affects the 'poly'
# kernel and is ignored here.
regressor=SVR(kernel='linear',degree=1)
regressor.fit(xtrain, ytrain)

# Look at the hyperparameters of the current model
print('Parameters currently in use:\n')
regressor.get_params()

from sklearn.model_selection import RandomizedSearchCV
# Regularization strengths to try
C = [0.001, 0.01, 0.1,1,10]
# Kernels to try
kernel = ['linear', 'poly']
# Epsilon-tube widths to try
epsilon = [0.001, 0.01, 0.1,1,10]
# Kernel coefficients to try
gamma = [0.001, 0.01, 0.1,1,10]

# Create the random grid (parameter distributions for RandomizedSearchCV)
random_grid = {'C': C,
               'kernel': kernel,
               'epsilon': epsilon,
               'gamma': gamma}
# Reset the training-target frame's index after earlier filtering, then drop
# the stale 'index' column that reset_index() adds.
ytr.reset_index(inplace=True)
ytr.drop(['index'], axis = 1, inplace=True)

X = ly_test['date']

retry = ytr['surge']       # surge in training data
horizontal = ytr['date']   # date in training data

# attempt support vector regression with three candidate kernels
svr_rbf = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
svr_lin = SVR(kernel='linear', C=100, gamma='auto')
svr_poly = SVR(kernel='poly', C=100, gamma='auto', degree=3, epsilon=.1,
               coef0=1)

# Inspect the initial hyperparameters before tuning
svr_rbf.get_params()

# Grid-search C and gamma for the RBF kernel with 5-fold cross-validation
svr_params = {'kernel': ['rbf'], 'C': [0.1,1,10,20,50], 'gamma':[1, 0.1, 0.01, 0.001]}
tune = GridSearchCV(SVR(), svr_params, cv=5)
tune.fit(lx_norm_train,ly_train['surge'])
tune.cv_results_

print("Best score: ", tune.best_score_)         #0.727 (2-yr. data)
print("Best parameters: ", tune.best_params_)

# Try with the best parameters (2-yr data)
#svr_rbf = SVR(kernel='rbf', C=1, gamma=0.001)

# Best parameters for cuxhaven.de (~5yr. data)
# score: 0.831, Best parameters:  {'C': 10, 'gamma': 0.001
#svr_rbf = SVR(kernel='rbf', C=10, gamma=0.001)
# regressor = SVR(kernel='rbf',C=5.0, epsilon=0.1)
# Fit an SVR with scikit-learn's default hyperparameters
regressor = SVR()
regressor.fit(XTrain, yTrain)

# Calculate errors on the held-out test set
yTestPredict = regressor.predict(XTest)
# NOTE(review): the 'squared' keyword of mean_squared_error is deprecated in
# recent scikit-learn; newer versions provide root_mean_squared_error instead.
mse = mean_squared_error(yTest, yTestPredict, squared=True)
rmse = mean_squared_error(yTest, yTestPredict, squared=False)
mae = mean_absolute_error(yTest, yTestPredict)
mape = mean_absolute_percentage_error(yTest, yTestPredict)
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))
print("The root Mean Square Error (RMSE) on test set: {:.4f}".format(rmse))
print("The mean absolute error on test set: {:.4f}".format(mae))
print("The mean absolute percentage error on test set: {:.4f}".format(mape))
print(regressor.get_params(deep=True))

# plt.plot(degreeGrid, mseValues, color='blue')
# plt.xlabel('degree values')
# plt.ylabel('Mean square error values')
# plt.title('kernel = poly')
# plt.show()
#
# plt.plot(degreeGrid, rmseValues, color='red')
# plt.xlabel('degree values')
# plt.ylabel('Root mean square error values')
# plt.title('kernel = poly')
# plt.show()
#
# plt.plot(degreeGrid, maeValues, color='green')
# plt.xlabel('degree values')
# Split predictiveAttributeNotDegree into train/test sets: the first
# train_percent rows (in file order, no shuffling) become training data and
# the rest testing data.  Columns 0, 1, 6-13 and 17 are the predictors;
# column 2 is the regression target.
count = 0
for i in range(len(predictiveAttributeNotDegree)):
    if count < train_percent:
        count = count + 1
        train_set_tot.append([predictiveAttributeNotDegree[i][0], predictiveAttributeNotDegree[i][1], predictiveAttributeNotDegree[i][6],
                          predictiveAttributeNotDegree[i][7], predictiveAttributeNotDegree[i][8], predictiveAttributeNotDegree[i][9],
                          predictiveAttributeNotDegree[i][10], predictiveAttributeNotDegree[i][11], predictiveAttributeNotDegree[i][12],
                          predictiveAttributeNotDegree[i][13], predictiveAttributeNotDegree[i][17]])
        train_result_tot.append([predictiveAttributeNotDegree[i][2]])
    else:
        test_set_tot.append([predictiveAttributeNotDegree[i][0], predictiveAttributeNotDegree[i][1], predictiveAttributeNotDegree[i][6],
                          predictiveAttributeNotDegree[i][7], predictiveAttributeNotDegree[i][8], predictiveAttributeNotDegree[i][9],
                          predictiveAttributeNotDegree[i][10], predictiveAttributeNotDegree[i][11], predictiveAttributeNotDegree[i][12],
                          predictiveAttributeNotDegree[i][13], predictiveAttributeNotDegree[i][17]])
        test_result_tot.append([predictiveAttributeNotDegree[i][2]])

# ravel() flattens the (n, 1) target array to the (n,) shape sklearn expects
train_result_tot = np.array(train_result_tot)
svm_reg_tot.fit(train_set_tot, train_result_tot.ravel())

# R^2 score on the held-out rows
print("----ALL ATTRIBUTE: score: ", svm_reg_tot.score(test_set_tot, test_result_tot))
# Predict each test row individually.
# NOTE(review): the per-row rebuild of `items` copies the row unchanged, and
# the whole loop could be a single batched svm_reg_tot.predict(test_set_tot)
# call -- left as-is to preserve the intermediate variables.
prediction = []
for item in test_set_tot:
    items = [[item[0], item[1], item[2], item[3], item[4], item[5], item[6], item[7], item[8], item[9], item[10]]]
    prediction.append(svm_reg_tot.predict(items))
# Flatten the list of 1-element prediction arrays into a flat vector
pred = np.zeros(len(prediction))
predi = np.array(prediction)
for i in range(len(prediction)):
    pred[i] = predi[i][0]
print(("MSE: {}".format(mean_squared_error(pred, test_result_tot))))
print("----ALL ATTRIBUTE: Params: ", svm_reg_tot.get_params())