Ejemplo n.º 1
0
def load_regressor_SVR(X_train, y_train):
    """Load tuned SVR hyper-parameters from disk, fit a fresh SVR with
    them on the given training data, and return the fitted model.

    NOTE(review): the pickle appears to hold a plain dict of SVR
    parameters (suitable for ``set_params``), not a fitted estimator —
    confirm against the code that wrote ``regressor_SVR.pkl``.
    """
    best_estimator_svr = joblib.load('./built_models/regressor_SVR.pkl')
    regressor_svr = SVR()
    regressor_svr.set_params(**best_estimator_svr)
    # print() function form works under both Python 2 and 3; the
    # original used a Python-2-only print statement.
    print(regressor_svr)
    regressor_svr.fit(X_train, y_train)

    return regressor_svr
Ejemplo n.º 2
0
def svr(x_train, x_test, y_train, y_test, model_processing_params):
    """Manual grid search over SVR hyper-parameters.

    Reads the grid from tuning/regression/svr.json, selects the
    configuration with the best test-set score, refits with it, prints
    the metrics, and persists the model plus its stats/params as JSON.
    """
    with open("tuning/regression/svr.json") as svr_params_file:
        file_data = json.load(svr_params_file)

    param_grid = file_data["param_grid"]

    svr_instance = SVR()

    best_score = None
    best_params = None

    for g in ParameterGrid(param_grid):
        svr_instance.set_params(**g)
        svr_instance.fit(x_train, y_train)

        new_score = svr_instance.score(x_test, y_test)

        # keep the best-scoring configuration (first iteration always wins)
        if best_score is None or new_score > best_score:
            best_score = new_score
            best_params = g

    # BUG FIX: refit with the *best* configuration before computing the
    # final metrics. Previously the instance still carried the last grid
    # entry's parameters, so mse/r2 and the saved model reflected an
    # arbitrary configuration rather than the winner.
    if best_params is not None:
        svr_instance.set_params(**best_params)
        svr_instance.fit(x_train, y_train)

    all_x = np.vstack((x_train, x_test))
    all_y = np.append(y_train, y_test)
    # local renamed from `r2_score` to avoid shadowing sklearn's metric
    r2 = svr_instance.score(all_x, all_y)

    y_test_predict = svr_instance.predict(x_test)
    mse = mean_squared_error(y_test, y_test_predict)

    print("\nSVR")
    print("MSE: " + str(mse))
    print("R2 score: " + str(r2))
    print("Best Score:" + str(best_score))
    print("Best params:")
    print(best_params)

    if not path.exists("./models/regression/svr"):
        makedirs("./models/regression/svr")
    dump(svr_instance, "./models/regression/svr/model.dump")

    model_stats = {
        "mse": mse,
        "r2_score": r2,
        "best_score": best_score,
        "best_params": best_params
    }

    with open("./models/regression/svr/model_stats.json", "w") as outfile:
        json.dump(model_stats, outfile, indent=4)

    with open("./models/regression/svr/model_processing_params.json",
              "w") as outfile:
        json.dump(model_processing_params, outfile, indent=4)
def svr_cv(cv_outer, data):
    """Nested cross-validation of an RBF SVR.

    For every outer fold in ``cv_outer`` (index pairs into ``data``),
    tunes C/epsilon/gamma with an inner 3-fold randomized search, refits
    on the outer training split, and records MAE, RMSE, median absolute
    error, R2, and the chosen parameters on the outer test split.

    Features are taken from columns 6 onward; the target is column 3.
    Returns five parallel lists: MAE, RMSE, MedAE, R2, best params.
    """
    mae_scores = []
    rmse_scores = []
    medae_scores = []
    r2_scores = []
    chosen_params = []

    for train_idx, test_idx in cv_outer:
        X_train = data.iloc[train_idx, 6:].values
        y_train = data.iloc[train_idx, 3].values
        X_test = data.iloc[test_idx, 6:].values
        y_test = data.iloc[test_idx, 3].values

        inner_folds = KFold(n_splits=3, shuffle=True, random_state=1)

        model = SVR(kernel='rbf')
        search_space = {
            'C': np.arange(100, 250, 10),
            'epsilon': [0.0001, 0.001, 0.01],
            'gamma': [0.005, 0.006, 0.007, 0.008, 0.009, 0.01]
        }

        # refit=False: we refit manually below with the winning params
        tuner = RandomizedSearchCV(model,
                                   search_space,
                                   cv=inner_folds,
                                   scoring='neg_mean_absolute_error',
                                   verbose=0,
                                   n_jobs=-1,
                                   n_iter=100,
                                   refit=False,
                                   random_state=0)
        tuner.fit(X_train, y_train)

        chosen_params.append(tuner.best_params_)

        model.set_params(**tuner.best_params_)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mae_scores.append(MAE(y_test, y_pred))
        rmse_scores.append(mean_squared_error(y_test, y_pred, squared=False))
        medae_scores.append(median_absolute_error(y_test, y_pred))
        r2_scores.append(r2_score(y_test, y_pred))

    return mae_scores, rmse_scores, medae_scores, r2_scores, chosen_params
Ejemplo n.º 4
0
 def reg_objective(hyperparams):
     """Hyperopt-style objective: validation RMSE of an SVR.

     Fits an SVR configured with ``hyperparams`` on the enclosing
     scope's (X_train, y_train) and scores root-mean-squared error on
     (X_val, y_val). NaN scores are replaced by a huge penalty so the
     optimizer never prefers a degenerate fit.
     """
     estimator = SVR()
     estimator.set_params(**hyperparams)
     fitted = estimator.fit(X_train, y_train)
     residuals = fitted.predict(X_val) - y_val
     rmse = np.sqrt((residuals**2).mean())
     if np.isnan(rmse):
         rmse = 1e8
     return {'loss': rmse, 'status': STATUS_OK}
Ejemplo n.º 5
0
def svr_from_config(config):
    """Rebuild an SVR estimator from a serialized config dict.

    ``config`` is expected to contain:
      - 'params': kwargs forwarded to ``SVR.set_params``
      - 'attributes': fitted-attribute name -> value; list values are
        converted to numpy arrays
      - 'attributes_types': optional numpy dtype name per attribute
        (defaults to 'float64')

    Returns the reconstructed SVR instance.
    """
    m = SVR()
    m.set_params(**config['params'])

    for attr, v in config['attributes'].items():
        dtype = config['attributes_types'].get(attr, 'float64')
        if isinstance(v, list):
            v = np.array(v, dtype=dtype)
        # setattr() is the idiomatic form of m.__setattr__(attr, v)
        setattr(m, attr, v)

    return m
Ejemplo n.º 6
0
    def update_event(self, input_called=-1):
        """Node update hook: when input 0 fires, fit an SVR.

        Input 1 optionally supplies a params dict for ``set_params``;
        inputs 2 and 3 supply X and y. The fitted estimator is written
        to output 1 and exec output 0 is triggered.
        """
        if input_called == 0:
            regr = SVR()
            # "is not None" is the correct identity test (PEP 8);
            # "!= None" relies on __eq__ and can misfire.
            if self.input(1) is not None:
                regr.set_params(**self.input(1))
            X = self.input(2)
            y = self.input(3)

            regr.fit(X, y)
            self.set_output_val(1, regr)

            self.exec_output(0)
Ejemplo n.º 7
0
def train_and_save_final_model(X, y, X_train, y_train, params,
                               save_model_file_path, test_data):
    """Fit an SVR with the given params and pickle it to disk.

    If ``test_data`` is None the model is fitted on the training split
    only; otherwise it is fitted on the full (X, y) data set. The model
    is saved to ``save_model_file_path + 'svr.sav'``.
    """
    svr = SVR()
    svr.set_params(**params)

    # "is None" instead of "== None" (identity test, PEP 8 / E711)
    if test_data is None:
        svr.fit(X_train, y_train)
    else:
        svr.fit(X, y)

    # save model; "with" closes the handle even if pickling fails
    # (the original pickle.dump(svr, open(...)) leaked the file object)
    model_file_path = save_model_file_path + 'svr.sav'
    with open(model_file_path, 'wb') as model_file:
        pickle.dump(svr, model_file)
Ejemplo n.º 8
0
        def SVM_regression(X_train, y_train, X_test, params):
            """Fit an SVR and predict for the single test row.

            Hyper-parameter handling is driven by ``hyperparameters``
            from the enclosing scope: 'RandomGridSearch', 'GridSearch',
            or 'Custom' (use ``params`` directly with CV scoring).

            Returns (predicted, validation_score).

            NOTE(review): if ``hyperparameters`` matches none of the
            three branches, ``predicted`` is unbound and the return
            raises NameError — confirm callers restrict its values.
            NOTE(review): ``iid='deprecated'`` was removed in
            scikit-learn >= 0.24 — confirm the pinned version.
            """
            # Combine train and test so they are standardized together
            sample = np.vstack((X_train, X_test))

            # Standardize the sample and split it again
            # (all rows but the last are train; the last row is test)
            sample = preprocessing.scale(sample)
            X_train = sample[:-1, :]
            X_test = sample[-1:, :]

            # Random grid search
            if hyperparameters == 'RandomGridSearch':
                # Grid search with cross-validation (3 folds)
                Cs = [0.001, 0.01, 0.1, 1, 10]
                epsilons = [0.1, 0.4, 0.7, 1.0]
                param_grid = {'C': Cs, 'epsilon': epsilons}
                # Define the model to be trained
                estimator = SVR(kernel = 'linear', gamma = 'scale')
                # Train the model over the given parameter options (random search)
                optimizer = RandomizedSearchCV(estimator, param_grid, n_iter = 5, cv = 3, iid = 'deprecated', scoring = 'neg_mean_absolute_error')
                optimizer.fit(X_train, np.ravel(y_train))
                regression = optimizer.best_estimator_
                predicted = regression.predict(X_test)
                validation_score = optimizer.best_score_
            # Exhaustive grid search
            elif hyperparameters == 'GridSearch':
                Cs = [0.001, 0.01, 0.1, 1, 10]
                epsilons = [0.1, 0.4, 0.7, 1.0]
                param_grid = {'C': Cs, 'epsilon': epsilons}
                # Define the model to be trained
                estimator = SVR(kernel = 'linear', gamma = 'scale')
                # Train the model over the given parameter options (grid search)
                optimizer = GridSearchCV(estimator, param_grid, cv = 3, iid = 'deprecated', scoring = 'neg_mean_absolute_error')
                optimizer.fit(X_train, np.ravel(y_train))
                regression = optimizer.best_estimator_
                predicted = regression.predict(X_test)
                validation_score = optimizer.best_score_
            elif hyperparameters == 'Custom':
                estimator = SVR()
                # Set the caller-supplied parameters
                estimator.set_params(**params)

                # Cross-validation check (scores only; no refit here)
                fold = KFold(n_splits = 3, shuffle = True)
                validation_score = cross_val_score(estimator = estimator, X = X_train, y = np.ravel(y_train), cv = fold, scoring = 'neg_mean_absolute_error')

                # Now train the model on all the training data
                estimator.fit(X_train, np.ravel(y_train))
                predicted = estimator.predict(X_test)

            return(predicted, validation_score)
Ejemplo n.º 9
0
class SVM():
    """Thin facade exposing sklearn's SVC and SVR behind one interface.

    ``task='cls'`` selects an SVC backend, ``task='prd'`` an SVR
    backend; all remaining kwargs are forwarded to the constructor.
    Every public method simply delegates to the wrapped estimator.
    """
    def __init__(self, task='cls', **kwargs):
        # Pick the backend and remember its name for dispatch later.
        if task == 'cls':
            self.svm, self._name = SVC(**kwargs), 'SVC'
        elif task == 'prd':
            self.svm, self._name = SVR(**kwargs), 'SVR'

    def decision_function(self, X):
        """Delegate to the backend's decision_function.

        X: (n_samples, n_features); returns
        (n_samples, n_classes * (n_classes-1) / 2).
        Only meaningful for the SVC backend; returns None otherwise.
        """
        if self._name == 'SVC':
            return self.svm.decision_function(X)

    def fit(self, X, y, sample_weight=None):
        """Fit the wrapped estimator.

        X: (n_samples, n_features); y: (n_samples,);
        sample_weight: (n_samples,), optional.
        """
        return self.svm.fit(X, y, sample_weight)

    def get_params(self, deep=True):
        """Return the wrapped estimator's parameters."""
        return self.svm.get_params(deep)

    def predict(self, X):
        """Predict with the wrapped estimator."""
        return self.svm.predict(X)

    def score(self, X, y, sample_weight=None):
        """Delegate scoring to the wrapped estimator.

        X: (n_samples, n_features);
        y: (n_samples,) or (n_samples, n_outputs);
        sample_weight: (n_samples,), default None.
        """
        return self.svm.score(X, y, sample_weight)

    def set_params(self, **params):
        """Forward hyper-parameter updates to the wrapped estimator."""
        return self.svm.set_params(**params)
Ejemplo n.º 10
0
class Baseline:
    """GIST-features + linear SVR baseline for predicting the distance
    from street-view images to a named destination in a city.

    NOTE(review): this class is Python 2 code (print statements,
    ``xrange``, integer division producing ``chunksize``); it will not
    run unmodified under Python 3 — confirm the target interpreter.
    """
    def __init__(self, city, dest_name):
        self.city = city
        self.dest_name = dest_name
        print 'Baseline implementation for {:s} : {:s}'.format(
            self.city, self.dest_name)
        # Column index of each destination inside the label files.
        # NOTE(review): 'gas_station' and 'high_school' both map to 3
        # and index 2 is never used — looks like a typo; confirm.
        dest_to_idx = {
            'bofa': 0,
            'church': 1,
            'gas_station': 3,
            'high_school': 3,
            'mcdonalds': 4
        }
        self.idx = dest_to_idx[self.dest_name]
        # Paths to the train/test image lists and distance labels.
        self.base_dir = osp.join('../data/dataset', city)
        self.train_label_filename = osp.join(self.base_dir, 'distance',
                                             'train_labels.txt')
        self.train_im_list_filename = osp.join(self.base_dir, 'distance',
                                               'train_im_list.txt')
        self.test_label_filename = osp.join(self.base_dir, 'distance',
                                            'test_labels.txt')
        self.test_im_list_filename = osp.join(self.base_dir, 'distance',
                                              'test_im_list.txt')
        # Linear-kernel SVR; large kernel cache for faster training.
        self.svr = SVR(kernel='linear',
                       shrinking=False,
                       cache_size=10000,
                       verbose=True)
        # self.svr = LinearSVR(verbose=1)

    def collect_train_data_parallel(self):
        # Read image paths and the label column for this destination.
        with open(self.train_im_list_filename, 'r') as train_f_im,\
            open(self.train_label_filename, 'r') as train_f_label:
            train_im_names = [
                osp.join('../data/dataset',
                         l.rstrip().split(' ')[0]) for l in train_f_im
            ]
            train_labels = [
                float(l.rstrip().split(' ')[self.idx]) for l in train_f_label
            ]

        # get dims: extract GIST for one image to size the matrix
        ge = GISTExtractor(width=256, height=256)
        im = cv2.imread(train_im_names[0])
        gist_features = ge.extract_gist(im)
        self.train_X = np.zeros((len(train_im_names), gist_features.shape[0]),
                                dtype=np.float)
        self.train_y = np.asarray(train_labels)

        # parallel feature extraction!
        print 'Collecting training data'
        pool = Pool(initializer=pool_init, initargs=(256, 256))
        chunksize = len(train_im_names) / 4
        for idx, feat in enumerate(
                pool.imap(gist_wrapper, train_im_names, chunksize)):
            self.train_X[idx, :] = feat

        pool.close()
        pool.join()

    def collect_train_data_serial(self):
        # Same data as the parallel variant, but features are read back
        # from a precomputed LMDB store instead of being extracted.
        with open(self.train_im_list_filename, 'r') as train_f_im,\
            open(self.train_label_filename, 'r') as train_f_label:
            train_im_names = [
                osp.join('../data/dataset',
                         l.rstrip().split(' ')[0]) for l in train_f_im
            ]
            train_labels = [
                float(l.rstrip().split(' ')[self.idx]) for l in train_f_label
            ]

        # get dims
        ge = GISTExtractor(width=256, height=256)
        im = cv2.imread(train_im_names[0])
        gist_features = ge.extract_gist(im)
        self.train_X = np.zeros((len(train_im_names), gist_features.shape[0]),
                                dtype=np.float)
        self.train_y = np.asarray(train_labels)

        db = lmdb.open('../data/dataset/gist',
                       map_size=int(1e12),
                       readonly=True)
        txn = db.begin()

        # serial feature extraction!
        print 'Collecting training data'
        for idx, im_name in enumerate(train_im_names):
            if idx % 100 == 0:
                print 'Image {:d} / {:d}'.format(idx, len(train_im_names))
            key = get_key(im_name)
            self.train_X[idx, :] = np.fromstring(txn.get(key))

    def collect_test_data_parallel(self):
        # Mirrors collect_train_data_parallel for the test split.
        with open(self.test_im_list_filename, 'r') as test_f_im,\
            open(self.test_label_filename, 'r') as test_f_label:
            test_im_names = [
                osp.join('../data/dataset',
                         l.rstrip().split(' ')[0]) for l in test_f_im
            ]
            test_labels = [
                float(l.rstrip().split(' ')[self.idx]) for l in test_f_label
            ]

        # get dims
        ge = GISTExtractor(width=256, height=256)
        im = cv2.imread(test_im_names[0])
        gist_features = ge.extract_gist(im)
        self.test_X = np.zeros((len(test_im_names), gist_features.shape[0]),
                               dtype=np.float)
        self.test_y = np.asarray(test_labels)

        # parallel feature extraction!
        print 'Collecting testing data'
        pool = Pool(initializer=pool_init, initargs=(256, 256))
        chunksize = len(test_im_names) / 4
        for idx, feat in enumerate(
                pool.imap(gist_wrapper, test_im_names, chunksize)):
            self.test_X[idx, :] = feat
        pool.close()
        pool.join()

    def collect_test_data_serial(self):
        # Mirrors collect_train_data_serial for the test split.
        with open(self.test_im_list_filename, 'r') as test_f_im,\
            open(self.test_label_filename, 'r') as test_f_label:
            test_im_names = [
                osp.join('../data/dataset',
                         l.rstrip().split(' ')[0]) for l in test_f_im
            ]
            test_labels = [
                float(l.rstrip().split(' ')[self.idx]) for l in test_f_label
            ]

        # get dims
        ge = GISTExtractor(width=256, height=256)
        im = cv2.imread(test_im_names[0])
        gist_features = ge.extract_gist(im)
        self.test_X = np.zeros((len(test_im_names), gist_features.shape[0]),
                               dtype=np.float)
        self.test_y = np.asarray(test_labels)

        db = lmdb.open('../data/dataset/gist',
                       map_size=int(1e12),
                       readonly=True)
        txn = db.begin()

        # serial feature extraction!
        print 'Collecting testing data'
        for idx, im_name in enumerate(test_im_names):
            if idx % 100 == 0:
                print 'Image {:d} / {:d}'.format(idx, len(test_im_names))
            key = get_key(im_name)
            self.test_X[idx, :] = np.fromstring(txn.get(key))

    def train(self, C=1.0, calc_loss=False):
        # Refit the SVR with regularization parameter C; optionally
        # report the L2 norm of the test-set residuals as the loss.
        print 'Training with C = {:f}'.format(C)
        p = self.svr.get_params()
        p['C'] = C
        self.svr.set_params(**p)
        self.svr.fit(self.train_X, self.train_y)
        loss = 0
        if calc_loss:
            test_y_pred = self.svr.predict(self.test_X)
            loss = np.linalg.norm(test_y_pred - self.test_y)
            # score = self.svr.score(self.test_X, self.test_y)
            print 'Loss = {:f}'.format(loss)
        return loss

    def cross_validate(self):
        # Sweep C over 10^-2 .. 10^4 and report the best by test loss.
        C = np.power(10.0, xrange(-2, 5))
        losses = np.array([self.train(c, calc_loss=True) for c in C])
        idx = np.argmin(losses)
        print 'Best C = {:f}'.format(C[idx])

    def save_current_model(self):
        # Persist the (last trained) SVR next to the label files.
        model_filename = osp.join(self.base_dir, 'distance',
                                  '{:s}.pkl'.format(self.dest_name))
        joblib.dump(self.svr, model_filename)
        print model_filename, 'saved'
## train

## set model
# lasso
# NOTE(review): the ``normalize`` constructor argument was removed from
# scikit-learn's linear models in 1.2 — confirm the pinned version.
lasso = Lasso(normalize=True)

# ridge
ridge = Ridge(normalize=True)

# elasticnet
elasticnet = ElasticNet(normalize=True)

# SVR regression (pre-tuned C/epsilon, linear kernel)
svr = SVR()
svr.set_params(C=0.045, epsilon=0.06, kernel='linear')

# Gradient Boosting Regressor (fixed seed for reproducibility)
gbr = GradientBoostingRegressor(n_estimators=6000,
                                learning_rate=0.01,
                                max_depth=4,
                                max_features='sqrt',
                                min_samples_leaf=15,
                                min_samples_split=10,
                                loss='huber',
                                random_state=42)

#Random Forest Regressor
randomForest = RandomForestRegressor(n_estimators=1200,
                                     max_depth=15,
                                     min_samples_split=5,
Ejemplo n.º 12
0
        def svm_regression(X_train,
                           y_train,
                           X_test,
                           params,
                           use_cv: bool = True):
            """Fit an SVR and predict for the single test row.

            If ``use_cv`` is False, fits directly with ``params`` (or
            defaults) and scores MAE on the training set. Otherwise the
            strategy comes from ``hyperparameters`` in the enclosing
            scope: 'RandomGridSearch', 'GridSearch', or 'Custom'.

            Returns (predicted, validation_score).

            NOTE(review): if ``use_cv`` is True and ``hyperparameters``
            matches none of the branches, ``predicted`` is unbound at the
            final return — confirm callers restrict its values.
            NOTE(review): ``iid='deprecated'`` was removed in
            scikit-learn >= 0.24 — confirm the pinned version.
            """
            # Combine our sample for the standardization procedure
            sample = np.vstack((X_train, X_test))

            # Standardize the sample and split again
            # (all rows but the last are train; the last row is test)
            sample = preprocessing.scale(sample)
            X_train = sample[:-1, :]
            X_test = sample[-1:, :]

            # If there are not enough points for cross validation
            if use_cv is False:
                if params is None:
                    model = SVR()
                else:
                    model = SVR(**params)
                model.fit(X_train, y_train)
                predicted = model.predict(X_test)

                # Calculate score on train
                train_predicted = model.predict(X_train)
                validation_score = mean_absolute_error(
                    np.ravel(y_train), np.ravel(train_predicted))
                return predicted, validation_score

            # Random grid search
            if hyperparameters == 'RandomGridSearch':
                # Carry out a random grid search with cross-validation (the number of folds is 3)
                Cs = [0.001, 0.01, 0.1, 1, 10]
                epsilons = [0.1, 0.4, 0.7, 1.0]
                param_grid = {'C': Cs, 'epsilon': epsilons}
                # Set the model to be trained
                estimator = SVR(kernel='linear', gamma='scale')
                # Train the model with the given options of parameters
                optimizer = RandomizedSearchCV(
                    estimator,
                    param_grid,
                    n_iter=5,
                    cv=3,
                    iid='deprecated',
                    scoring='neg_mean_absolute_error')
                optimizer.fit(X_train, np.ravel(y_train))
                regression = optimizer.best_estimator_
                predicted = regression.predict(X_test)
                validation_score = optimizer.best_score_
            # Full grid search
            elif hyperparameters == 'GridSearch':
                Cs = [0.001, 0.01, 0.1, 1, 10]
                epsilons = [0.1, 0.4, 0.7, 1.0]
                param_grid = {'C': Cs, 'epsilon': epsilons}
                # Set the model to be trained
                estimator = SVR(kernel='linear', gamma='scale')
                # Train the model with the given options of parameters
                optimizer = GridSearchCV(estimator,
                                         param_grid,
                                         cv=3,
                                         iid='deprecated',
                                         scoring='neg_mean_absolute_error')
                optimizer.fit(X_train, np.ravel(y_train))
                regression = optimizer.best_estimator_
                predicted = regression.predict(X_test)
                validation_score = optimizer.best_score_
            elif hyperparameters == 'Custom':
                estimator = SVR()
                # Set the params
                estimator.set_params(**params)

                # Cross-validation (scores only; refit happens below)
                fold = KFold(n_splits=3, shuffle=True)
                validation_score = cross_val_score(
                    estimator=estimator,
                    X=X_train,
                    y=np.ravel(y_train),
                    cv=fold,
                    scoring='neg_mean_absolute_error')
                estimator.fit(X_train, np.ravel(y_train))
                predicted = estimator.predict(X_test)

            return predicted, validation_score
    X = scaler.fit_transform(X)
    Xt = scaler.transform(Xt)
        
    ##############################################################
    # stacking result = svr + α * rfr + β * gbr
    # tune α and β with cross validation
    ##############################################################
    scores = dict()
    skf = cross_validation.StratifiedKFold(Y, n_folds=3)
    for train_index, test_index in skf:
        X1, X2 = X[train_index], X[test_index]
        Y1, Y2 = Y[train_index], Y[test_index]
        
        # predict with SVR
        svr = SVR()
        svr.set_params(**pickle.load(open("svr.p", "rb" )))
        svr.fit(X1, Y1)
        Y_svr = svr.predict(X2)

        # predict with RF
        rfr = RandomForestRegressor(n_estimators = 1000)
        rfr.set_params(**pickle.load(open("rfr.p", "rb" )))
        rfr.fit(X1, Y1)
        Y_rfr = rfr.predict(X2)
    
        # predict with GBT
        gbr = GradientBoostingRegressor(n_estimators=3000)
        gbr.set_params(**pickle.load(open("gbr.p", "rb" )))
        gbr.fit(X1, Y1)
        Y_gbr = gbr.predict(X2)
        
Ejemplo n.º 14
0
def standard_experiment(train_df, test_df, feature_names, args):
    """Train a regressor on train_df, evaluate NDCG/MSE on both splits,
    and optionally persist the model, top features, and predictions.

    ``args`` supplies the classifier choice ('svr', 'rf', 'elasticnet',
    or 'baseline'), tuning flags, NDCG weighting, and save paths.

    NOTE(review): this is Python 2 code (print statements, cPickle) and
    relies on removed pandas/numpy APIs (``as_matrix``, ``np.float``) —
    confirm the pinned library versions.
    """

    train_df['set'] = "train"  # annotate
    test_df['set'] = "test"  # annotate

    # clip training set, if necessary
    if (0 < args.limit_data < len(train_df)):
        print "Clipping training set to %d comments" % args.limit_data
        train_df = train_df[:args.limit_data]

    # Split into X, y for regression
    target = args.target
    train_X = train_df.filter(feature_names).as_matrix().astype(
        np.float)  # training data
    train_y = train_df.filter([target]).as_matrix().astype(
        np.float)  # training labels
    test_X = test_df.filter(feature_names).as_matrix().astype(
        np.float)  # test data
    test_y = test_df.filter([target
                             ]).as_matrix().astype(np.float)  # ground truth

    # For compatibility, make 1D
    train_y = train_y.reshape((-1, ))
    test_y = test_y.reshape((-1, ))

    print "Training set: %d examples" % (train_X.shape[0], )
    print "Test set: %d examples" % (test_X.shape[0], )
    print "Selected %d features" % (len(feature_names), )
    print 'Features: %s' % (' '.join(feature_names))

    ##
    # Preprocessing: scale data, keep SVM happy
    scaler = preprocessing.StandardScaler()
    train_X = scaler.fit_transform(
        train_X)  # faster than fit, transform separately
    test_X = scaler.transform(test_X)

    if args.classifier != 'baseline':
        if args.stock_params:
            # Use fixed, pre-chosen hyper-parameters
            if args.classifier == 'svr':
                print "Initializing SVR model"
                clf = SVR(**STANDARD_PARAMS['svr'])
            elif args.classifier == 'rf':
                print "Initializing RandomForestRegressor model, seed=%d" % args.rfseed
                clf = RandomForestRegressor(random_state=args.rfseed,
                                            **STANDARD_PARAMS['rf'])
            elif args.classifier == 'elasticnet':
                print "Initializing ElasticNet model"
                clf = ElasticNet(max_iter=10000,
                                 **STANDARD_PARAMS['elasticnet'])
            else:
                raise ValueError("Invalid classifier '%s' specified." %
                                 args.classifier)

        else:
            ##
            # Run Grid Search / 10xv on training/dev set
            start = time.time()
            print "== Finding optimal classifier using Grid Search =="
            params, clf = train_optimal_classifier(train_X,
                                                   train_y,
                                                   classifier=args.classifier,
                                                   rfseed=args.rfseed,
                                                   quickmode=args.quickmode)
            print "Optimal parameters: " + json.dumps(params, indent=4)
            if hasattr(clf, "support_vectors_"):
                print 'Number of support vectors: %d' % len(
                    clf.support_vectors_)
            print "Took %.2f minutes to train" % ((time.time() - start) / 60.0)

        if hasattr(clf, 'random_state'):
            clf.set_params(random_state=args.rfseed)
        clf.fit(train_X, train_y)
        params = clf.get_params()

    ##
    # Set up evaluation function
    if args.ndcg_weight == 'target':
        favfunc = evaluation.fav_target  # score weighting
    else:
        favfunc = evaluation.fav_linear  # rank weighting

    max_K = 20
    # NOTE: result_label is assigned *below*; the lambda works only
    # because of Python's late binding — it is always called after the
    # assignment. Fragile if this ordering ever changes.
    eval_func = lambda data: evaluation.ndcg(data,
                                             max_K,
                                             target=args.ndcg_target,
                                             result_label=result_label,
                                             fav_func=favfunc)

    ##
    # Predict scores for training set
    result_label = "pred_%s" % args.target  # e.g. pred_score
    if args.classifier != 'baseline':
        train_pred = clf.predict(train_X)
    else:  # baseline: post order
        train_pred = -1 * train_df['position_rank']
    train_df[result_label] = train_pred

    print 'Performance on training data (NDCG with %s weighting)' % args.ndcg_weight
    # ndcg_train = eval_func(train_df)
    ndcg_train = eval_func(
        train_df[train_df.parent_nchildren >= args.min_posts_ndcg])
    for i, score in enumerate(ndcg_train, start=1):
        print '\tNDCG@%d: %.5f' % (i, score)
    print 'Karma MSE: %.5f' % mean_squared_error(train_y, train_pred)

    ##
    # Predict scores for test set
    if args.classifier != 'baseline':
        test_pred = clf.predict(test_X)
    else:  # baseline: post order
        test_pred = -1 * test_df['position_rank']
    test_df[result_label] = test_pred

    print 'Performance on test data (NDCG with %s weighting)' % args.ndcg_weight
    # ndcg_test = eval_func(test_df)
    ndcg_test = eval_func(
        test_df[test_df.parent_nchildren >= args.min_posts_ndcg])
    for i, score in enumerate(ndcg_test, start=1):
        print '\tNDCG@%d: %.5f' % (i, score)
    print 'Karma MSE: %.5f' % mean_squared_error(test_y, test_pred)

    ##
    # Save model to disk
    if args.savename and (args.classifier != 'baseline'):
        import cPickle as pickle
        saveas = args.savename + ".model.pkl"
        print "== Saving model as %s ==" % saveas
        with open(saveas, 'w') as f:
            pickle.dump(clf, f)

    ##
    # Get feature importance, if possible
    if args.savename and (args.classifier != 'baseline'):
        feature_importances = get_feature_importance(
            clf, args.classifier, feature_names=feature_names, sorted=True)
        saveas = args.savename + ".topfeatures.txt"
        print "== Recording top features to %s ==" % saveas
        # np.savetxt(saveas, feature_importances)
        # with open(saveas, 'w') as f:
        # json.dump(feature_importances, f, indent=2)
        with open(saveas, 'w') as f:
            # column-align feature names against the longest one
            maxlen = max([len(fname) for fname in feature_importances[0]])
            f.write("# Model: %s\n" % args.classifier)
            f.write("# Params: %s\n" % json.dumps(params))
            for fname, val in zip(*feature_importances):
                f.write("%s  %.06f\n" % (fname.ljust(maxlen), val))
            f.flush()

    ##
    # Save data to HDF5
    if args.savename:

        # Save score predictions
        fields = [
            "self_id", "parent_id", 'cid', 'sid', 'set', args.target,
            result_label
        ]
        if not args.ndcg_target in fields:
            fields.append(args.ndcg_target)
        saveas = args.savename + ".scores.h5"
        print "== Saving raw predictions as %s ==" % saveas
        outdf = pd.concat([train_df[fields], test_df[fields]],
                          ignore_index=True)
        outdf.to_hdf(saveas, 'data')

        if args.savefull:
            # Concatenate train, test
            df = pd.concat([train_df, test_df], ignore_index=True)

            print "== Exporting data to HDF5 =="
            saveas = args.savename + ".data.h5"
            df.to_hdf(saveas, "data")
            print "  [saved as %s]" % saveas

        # Save NDCG calculations
        dd = {
            'k': range(1, max_K + 1),
            'method': [args.ndcg_weight] * max_K,
            'ndcg_train': ndcg_train,
            'ndcg_test': ndcg_test
        }
        resdf = pd.DataFrame(dd)
        saveas = args.savename + ".results.csv"
        print "== Saving results to %s ==" % saveas
        resdf.to_csv(saveas)
Ejemplo n.º 15
0
# Toy SVR experiment (Python 2: bare print statements).
# NOTE: each X/y assignment below overwrites the previous one, so only
# the final 1..9 ramp data is actually used for fitting.
y = range(10) # np.random.randn(n_samples)
#X = np.random.randn(n_samples, n_features)
#y = [
#    [1, 38],
#    [2, 59],
#    [3, 14],
#]
y = [1, 0, 0, 1, 1, 1, 1, 0, 0, 0]
X = [
    [1, 24],
    [3, 48],
    [3, 63],
    [1, 12],
    [1, 27],
    [1, 31],
    [1, 18],
    [3, 50],
    [3, 73],
    [3, 82],
]

# Effective data: y = 1..9, X = [[1]..[9]] (overwrites everything above)
y = [i for i in range(1, 10)]
X = [[i] for i in range(1, 10)]
print y
print X

clf = SVR(kernel='linear')#, C=1.0, epsilon=0.2)
print clf.fit(X, y)
print clf.predict([[i] for i in range(10)])
# set_params only reconfigures; the model is NOT refitted afterwards
print clf.set_params(kernel='rbf')
# Scatter plot of the ridge-regression predictions against test values
pyplot.scatter(y_norm_test, pred_ridgeA_best, color='blue')
plt.xlabel('Test Values')
plt.ylabel('Predicted Values')
plt.title('Ridge Regression Scatter Plot Test Set (BRAAK A)')
pyplot.show()

#Linear Support Vector Regression for BRAAK12
from sklearn.svm import SVR

# Log-spaced candidate regularisation strengths  #make this a smaller range
C = np.logspace(start=-5, stop=0, num=50)
#epsilon = [2,4]
print(C)
C
svr_lin_A = SVR(kernel='linear')  #try rbf

# Record the test-set MSE for every candidate C
MSE = []
for c_value in C:
    svr_lin_A.set_params(C=c_value)
    svr_lin_A.fit(X_norm_train, y_norm_train)
    pred_lin_A = svr_lin_A.predict(X_norm_test)
    MSE.append(mean_squared_error(y_norm_test, pred_lin_A))

# Plot MSE as a function of C on a logarithmic x-axis
ax = plt.gca()
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(C, MSE)
ax.set_xscale('log')
plt.axis('tight')
plt.xlabel('Regularisation Parameter (C)')
plt.ylabel('Mean Squared Error (MSE)')
plt.title('Linear Support Vector Regression Test Set (BRAAK A)')
plt.show()
Ejemplo n.º 17
0
# RBF SVR; `epsilon` comes from earlier in the script (not shown here)
clf = SVR(kernel='rbf', epsilon=epsilon, C=3)

# Bookkeeping for the feature-dropping / gamma search further below
unchanged = 0
droplist = []
gamma_search = [2**x for x in range(-15, 3)]

# 67/33 train/test split of the dataset
# NOTE(review): columns 0-10 are treated as features and column 11 as
# the target — presumably a wine-quality DataFrame; confirm upstream.
wdcopy = whitedat.copy()
train_set = wdcopy.sample(frac=0.67, random_state=0)
test_set = wdcopy.drop(train_set.index)
X_train = train_set[train_set.columns[0:11]]
y_train = train_set[train_set.columns[11]]

X_test = test_set[test_set.columns[0:11]]
y_test = test_set[test_set.columns[11]]

# Baseline fit with a fixed gamma; report test-set MSE
clf.set_params(gamma=0.5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

L2 = mean_squared_error(y_test, y_pred)
print(L2)

while True:
    # First, search for optimal gamma
    wdcopy = whitedat.copy()
    train_set = wdcopy.sample(frac=0.67, random_state=0)
    test_set = wdcopy.drop(train_set.index)

    X_train = train_set[train_set.columns[0:11]]
    y_train = train_set[train_set.columns[11]]
Ejemplo n.º 18
0
# Scatter plot of ridge predictions vs. test values for MCI BRAAK56
pyplot.scatter(y_tmp_test_ridge, yhat_tmp_ridge, color='red')
plt.xlabel('Test Values  (MCI BRAAK56)')
plt.ylabel('Predicted Values (MCI BRAAK56)')
plt.title(
    'Ridge Regression Scatter Plot MCI BRAAK 56 Test Correlation Dataset')
pyplot.show()

#RBF Support Vector Machine for MCI BRAAK 12
#C = np.logspace(start = -5, stop = 0, num = 70 )
#this is a good value for MCI BRAAK56 but not for MCI 12
C = np.logspace(start=-3, stop=5, num=40)
svr_mci_rbf = SVR(kernel='rbf', gamma='scale')
# Sweep C and record the test-set MSE for each value
MSE = []
for a in C:
    svr_mci_rbf.set_params(C=a)
    svr_mci_rbf.fit(X_norm_train, y_norm_mci_train)
    pred_mci_rbf = svr_mci_rbf.predict(X_norm_test)
    MSEtemp = mean_squared_error(y_norm_mci_test, pred_mci_rbf)
    MSE.append(MSEtemp)

# Plot MSE against C on a logarithmic x-axis
ax = plt.gca()

ax.plot(C, MSE)
ax.set_xscale('log')
plt.axis('tight')
plt.xlabel('Regularisation Parameter C')
plt.ylabel('Mean Squared Error')
plt.title('RBF Support Vector Regression MCI Braak 56')
plt.show()
#grid search for RBF SVR
model = SVR()

# Cartesian product of C and epsilon candidates
clist = [1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e10, 1e15]
epsilonlist = [0, .5, 1, 5, 10, 20, 30, 40, 50]
hplist = cartesian((clist, epsilonlist))

#hp_dict = {'C': paramlist1, 'epsilon': paramlist2}

#hp_list = list(ParameterGrid(hp_dict))

# loop over each set model hyperparameters

scores = []

# Fit and score on the SAME data (training score, not validation);
# bubble size in the scatter encodes the |score| of each (C, epsilon).
for hp0 in hplist:
    model.set_params(C=hp0[0], epsilon=hp0[1])
    model.fit(X, y)

    score0 = abs(model.score(X, y))
    scores.append(score0)

    print(hp0[0], hp0[1], score0 * 200)
    plt.scatter(hp0[0], hp0[1], s=score0 * 200, c='b', alpha=0.3)
plot_setup(scales=['log', 'linear'], labels=['C', 'Epsilon'])
plt.plot()
'''   
for hp in hp_list:
    print(hp)
    model.set_params(**hp)
    model.fit(X, y)
    print(model.score(X, y))
Ejemplo n.º 20
0
# Tune the SVR gamma with a validation curve, refit on a subsample, and
# evaluate RMSLE on a separately drawn test sample in batches.
regressor = SVR(verbose=10)

gamma_range = np.logspace(-5, 2, 10)
# FIX: param_name / param_range are keyword-only in scikit-learn >= 0.24;
# passing them positionally (as before) raises a TypeError there.
train_scores, valid_scores = validation_curve(regressor,
                                              X_train,
                                              y_train,
                                              param_name="gamma",
                                              param_range=gamma_range,
                                              n_jobs=-1,
                                              scoring=rmsle_scorer)
# Collapse the per-fold scores to one mean validation score per gamma.
valid_scores = [np.mean(s) for s in valid_scores]

# Take the alpha giving the highest validation score, and test it on test set
# (nanargmax skips gammas whose folds all failed and scored NaN).
best_gamma = gamma_range[np.nanargmax(valid_scores)]
print("best gamma:", best_gamma)
regressor.set_params(gamma=best_gamma)

# Refit on a fresh 20k-row subsample — the full dataset is too large.
X_train, y_train = resample(X, y, n_samples=20000)
regressor.fit(X_train, y_train)

print("test")
# Since we can't load the whole dataset, do batch testing
batch_size = 5000
X_test, y_test = resample(X, y, n_samples=100000)
y_pred = np.ndarray((0, ))
for i in range(0, X_test.shape[0], batch_size):
    print(i)
    y_pred = np.hstack((y_pred, regressor.predict(X_test[i:i + batch_size])))
print("RMSLE =", root_mean_squared_log_error(y_test, y_pred))
Ejemplo n.º 21
0
class svReg(customRegressor):
    """SVR-based house-price regressor.

    Imputes and encodes the raw frame, log-transforms SalePrice as the
    target, and standardises X (ColumnTransformer + RobustScaler pipeline)
    and y (StandardScaler) separately.
    """

    def __init__(self, in_df, zoning, utilities, frontage, qualPow):

        super(svReg, self).__init__()
        from lm_features import impute_shell
        ## Because we're currying in python now: impute_shell returns a
        ## pre-configured imputation function applied to copies of the input.
        self._imputeVals = impute_shell(frontage=frontage,
                                        zoning=zoning,
                                        utilities=utilities,
                                        qualPow=qualPow)
        tempDF = self._imputeVals(in_df.copy())
        self.X = tempDF.drop(columns=["SalePrice"]).copy()
        # y is log(SalePrice), shaped (n, 1) for the StandardScaler below.
        self.y = np.log(tempDF.SalePrice.values.reshape(-1, 1))

        self.pipeline_X = self._make_pipe()
        self.pipeline_X.fit(self.X)
        self.pipeline_y = StandardScaler()
        self.pipeline_y.fit(self.y)

    def _rmOutliers(self, x, y):
        # NOTE(review): this looks buggy. self.y holds log-prices (~11-13),
        # so the (y > 4000) & (y < 5E5) mask is always False and nothing is
        # ever dropped; the thresholds appear meant for raw SalePrice. If the
        # mask ever did match, x would be filtered while y is not, and the
        # callers' fit(piped_X, piped_y) would see mismatched lengths.
        # Confirm the intended filter before changing behavior.
        outliers = ((y > 4000) & (y < 5E5))
        out = x[~(outliers)]

        return out

    def _make_pipe(self):
        """Build the preprocessing pipeline for X.

        Column groups come from svr_features: fillNone -> impute "None" then
        one-hot; fillZeroCat -> impute 0 then one-hot; fillZeroCont ->
        impute 0 then power-transform; imputeDict columns use the custom
        dictImputer; booleans and categorical ints pass through; dropList
        columns are discarded. The combined output is robust-scaled.
        """
        import svr_features as f
        nonePipeline = make_pipeline(
            SimpleImputer(strategy="constant", fill_value="None"),
            OneHotEncoder(drop="first"))
        zeroPipeline = make_pipeline(
            SimpleImputer(strategy="constant", fill_value=0),
            OneHotEncoder(drop="first", categories="auto"))
        scalePipeline = make_pipeline(
            SimpleImputer(strategy="constant", fill_value=0),
            PowerTransformer())

        regressionPipeline = ColumnTransformer(
            [("setNone", nonePipeline, f.fillNone),
             ("setZero", zeroPipeline, f.fillZeroCat),
             ("transformed", scalePipeline, f.fillZeroCont),
             ("dictImputed",
              make_pipeline(
                  self.dictImputer(f.imputeDict),
                  OneHotEncoder(drop="first")), list(f.imputeDict.keys())),
             ("bool", "passthrough", f.imputeBool),
             ("categoricalInts", "passthrough", f.cat_to_int),
             ("dropped", "drop", f.dropList)],
            remainder="drop")
        return make_pipeline(regressionPipeline, RobustScaler())

    def gridSearch(self, params, cv=5, njobs=-1, verbose=50):
        """Run a cross-validated grid search over SVR params on the
        preprocessed data; the fitted search object is kept on self."""
        self._searchSpace = params

        piped_X = self._rmOutliers(self.X, self.y)
        piped_X = self.pipeline_X.transform(piped_X)
        piped_y = self.pipeline_y.transform(self.y)

        self._gridSearchObject = GridSearchCV(SVR(),
                                              params,
                                              cv=cv,
                                              scoring="neg_mean_squared_error",
                                              n_jobs=njobs,
                                              verbose=verbose)
        self._gridSearchObject.fit(piped_X, piped_y)

    def fitModel(self, params):
        """Fit a fresh SVR with the given params on the preprocessed data.

        NOTE(review): piped_y is (n, 1); sklearn will warn and ravel it.
        """
        self.model = SVR()
        self._params = params

        piped_X = self._rmOutliers(self.X, self.y)
        piped_X = self.pipeline_X.transform(piped_X)
        piped_y = self.pipeline_y.transform(self.y)

        self.model.set_params(**params)
        self.model.fit(piped_X, piped_y)

    def getTrainRsquared(self):
        """R^2 of the fitted model on its own (preprocessed) training data."""
        piped_X = self._rmOutliers(self.X, self.y)
        piped_X = self.pipeline_X.transform(piped_X)
        piped_y = self.pipeline_y.transform(self.y)
        return self.model.score(piped_X, piped_y)
Ejemplo n.º 22
0
# In[23]:


# Features are columns 6 onward; the target lives in column 3 (both splits).
X_train = train_data.iloc[:, 6:].values
y_train = train_data.iloc[:, 3].values
X_valid = valid_data.iloc[:, 6:].values
y_valid = valid_data.iloc[:, 3].values


# In[24]:


# Fit one SVR per hyper-parameter configuration and keep each model's
# validation-set predictions for averaging below.
predictions = []
for param in model_params:
    model = SVR()
    model.set_params(**param)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_valid)
    predictions.append(y_pred)


# In[25]:


# Ensemble prediction: the per-sample mean over all fitted models.
y_hat = [sum(col) / len(col) for col in np.array(predictions).T]


# In[26]:

Ejemplo n.º 23
0
def fun_svm_fs(x, *args):
    """Objective function for a metaheuristic SVR hyper-parameter and
    feature-subset search.

    Parameters
    ----------
    x : sequence of floats
        Encoded solution: x[0] kernel index, x[1] degree, x[2] gamma
        (negative selects 'scale'), x[3] coef0, x[4] C, x[5] epsilon; any
        remaining entries are per-feature on/off genes (> 0.5 keeps the
        feature).
    *args : tuple
        (X, y, flag, n_splits, random_seed) — data, 'eval'/report mode flag,
        CV fold count and seed.

    Returns
    -------
    When flag == 'eval': the cross-validated RMSE (1e12 on failure).
    Otherwise: a dict with the fitted estimator, the active-feature index
    and training-error metrics.
    """
    X, y, flag, n_splits, random_seed = args
    clf = SVR(kernel='rbf', )
    n_var = X.shape[1]

    # Decoding table for the kernel gene x[0].
    kernel = {
        2: 'linear',
        3: 'poly',
        0: 'rbf',
        1: 'sigmoid',
        4: 'laplacian',
        5: 'chi2'
    }

    #p={'C':x[0], 'kernel':kernel[int(round(x[2]))], 'gamma':x[1]}

    p = {
        'kernel': kernel[int(round(x[0]))],
        'degree': int(round(x[1])),
        'gamma': 'scale' if x[2] < 0 else x[2],
        'coef0': x[3],
        'C': x[4],
        'epsilon': x[5],
        'max_iter': 4000,
    }

    clf.set_params(**p)
    n_param = len(p)
    # Feature mask: keep everything when the solution carries no per-feature
    # genes; otherwise threshold the trailing entries at 0.5.
    # NOTE(review): the trailing genes are read from x[2:], which overlaps the
    # six hyper-parameter slots decoded above — x[n_param:] may have been
    # intended; preserved as-is pending confirmation.
    if len(x) <= n_param:
        ft = np.array([1 for i in range(n_var)])
        ft = np.where(ft > 0.5)
    else:
        ft = np.array([1 if k > 0.5 else 0 for k in x[2::]])
        ft = np.where(ft > 0.5)

    try:
        #cv=KFold(n_splits=n_splits, shuffle=True, random_state=random_seed)
        cv = KFold(n_splits=n_splits,
                   shuffle=True,
                   random_state=int(random_seed))
        #cv=StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=int(random_seed))
        y_p = cross_val_predict(clf, X, y, cv=cv, n_jobs=1)

        r = RMSE(y_p, y)
        r2 = MAPE(y_p, y)
        r3 = RRMSE(y_p, y)
        r4 = -r2_score(y_p, y)
        #r =  mean_squared_error(y,y_p)**0.5
        #r =  -accuracy_score(y,y_p)
        #r =  -precision_score(y,y_p)
        #r =  -f1_score(y,y_p,average='weighted')

    except Exception:
        # FIX: was a bare `except:`, which also swallowed KeyboardInterrupt /
        # SystemExit. Penalise infeasible configurations instead of crashing.
        y_p = [None]
        r = 1e12
        # FIX: keep the secondary metrics defined so the report dict below
        # cannot raise NameError when the CV step failed.
        r2 = r3 = r4 = 1e12

    #print (r,'\t',p,'\t',ft)
    #print (r)
    if flag == 'eval':
        return r
    else:
        clf.fit(X[:, ft].squeeze(), y)
        return {
            'Y_TRUE': y,
            'Y_PRED': y_p,
            'EST_PARAMS': p,
            'PARAMS': x,
            'EST_NAME': 'SVM',
            'ESTIMATOR': clf,
            'ACTIVE_VAR': ft,
            'DATA': X,
            'SEED': random_seed,
            'ERROR_TRAIN': {
                'RMSE': r,
                'MAPE': r2,
                'RRMSE': r3,
                'R2_SCORE': r4
            }
        }
Ejemplo n.º 24
0
def predict_COVID_part2(train_df, train_labels_df, test_feature):
    """Predict COVID case counts with a stacked ensemble of RBF SVRs.

    Builds 16 lagged daily-case features, trains three SVRs on hand-picked
    regimes of the series — 'increasing' (rows 50-79), 'decreasing' (81-136)
    and 'constant' (138-) — then trains a fourth SVR on those three models'
    predictions and delegates the final forecast to makePrediction().

    Parameters
    ----------
    train_df : pandas.DataFrame with a 'dailly_cases' column.
    train_labels_df : pandas.DataFrame with a 'day' column plus the target.
    test_feature : passed through unchanged to makePrediction().

    Returns
    -------
    The result of makePrediction() for test_feature.
    """
    numberOFDaysToStartFrom = 50

    # Shared hyper-parameters for every sub-model in the ensemble
    # (previously four byte-identical copy-pasted set_params dicts).
    svrParams = {
        'kernel': 'rbf',
        'degree': 1,
        'C': 9500,
        'gamma': 'scale',
        'coef0': 0.0,
        'tol': 0.001,
        'epsilon': 110
    }

    def makeSvr():
        # Fresh SVR configured with the shared params (set_params returns self).
        return SVR().set_params(**svrParams)

    # FIX: .copy() so the lag columns below are not written into a view of
    # train_df (SettingWithCopyWarning / silent caller mutation).
    df2 = train_df[['dailly_cases']].copy()

    # Add 16 lag features: 'dailly_cases-k' holds the case count k days back,
    # filled only from day `numberOFDaysToStartFrom` onward.
    casesCol = 'dailly_cases'
    casesList = []
    pastCase = 16
    for index in range(1, pastCase + 1):
        newColName = casesCol + '-' + str(index)
        casesList.append(newColName)
        df2[newColName] = np.nan
        for rowInd in range(numberOFDaysToStartFrom, len(df2)):
            df2.loc[rowInd, newColName] = int(df2.loc[rowInd - index,
                                                      casesCol])

    # Split the series into the three hand-picked regimes.
    dataFrameIncreasing = df2[50:80]
    dataFrameDecreasing = df2[81:137]
    dataFrameConstant = df2[138:]

    svrModelIncreasing = makeSvr()
    svrModelDecreasing = makeSvr()
    svrModelConstant = makeSvr()

    # FIX: positional `axis` for DataFrame.drop was deprecated and removed in
    # pandas 2.0 — use the explicit `columns=` form throughout.
    xTrainIncreasing = dataFrameIncreasing.drop(columns=['dailly_cases'])
    yTrainIncreasing = train_labels_df.iloc[50:80].drop(columns=['day'])

    xTrainDecreasing = dataFrameDecreasing.drop(columns=['dailly_cases'])
    yTrainDecreasing = train_labels_df.iloc[81:137].drop(columns=['day'])

    xTrainConstant = dataFrameConstant.drop(columns=['dailly_cases'])
    yTrainConstant = train_labels_df.iloc[138:].drop(columns=['day'])

    svrModelIncreasing.fit(xTrainIncreasing, yTrainIncreasing)

    svrModelDecreasing.fit(xTrainDecreasing, yTrainDecreasing)

    svrModelConstant.fit(xTrainConstant, yTrainConstant)

    # Each regime model predicts over the whole lag-complete range; those
    # three prediction streams become the meta-model's training features.
    testingForSeperateModels = df2.drop(columns=['dailly_cases'])
    testingForSeperateModels = testingForSeperateModels[
        numberOFDaysToStartFrom:]

    increasingModelPrediction = svrModelIncreasing.predict(
        testingForSeperateModels)
    decreasingModelPrediction = svrModelDecreasing.predict(
        testingForSeperateModels)
    constantModelPrediction = svrModelConstant.predict(
        testingForSeperateModels)

    # Floor each prediction (case counts are integers) and stack the three
    # streams row-wise for the meta-model.
    combinedData = []
    for index in range(len(increasingModelPrediction)):
        combinedData.append([
            math.floor(increasingModelPrediction[index]),
            math.floor(decreasingModelPrediction[index]),
            math.floor(constantModelPrediction[index]),
        ])

    xTrainCombinedModel = pd.DataFrame(combinedData,
                                       columns=[
                                           'increasingModelPrediction',
                                           'decreasingModelPrediction',
                                           'constantModelPrediction'
                                       ])
    yTrainCombinedModel = train_labels_df.iloc[
        numberOFDaysToStartFrom:].drop(columns=['day'])

    # Meta-model: stacks the three regime predictions.
    svrModelCombined = makeSvr()

    svrModelCombined.fit(xTrainCombinedModel, yTrainCombinedModel)

    dataColumns = casesList

    finalPrediction = makePrediction(svrModelIncreasing, svrModelDecreasing,
                                     svrModelConstant, svrModelCombined,
                                     dataColumns, test_feature)

    return finalPrediction