Example #1
    def train_SVM(self, data):
        train, validacion = data
        x_tr, y_tr = train
        x_val, y_val = validacion
        #print("El set de train tiene {} filas y {} columnas".format(x_tr.shape[0],x_tr.shape[1]))
        #print("El set de validacion tiene {} filas y {} columnas".format(x_val.shape[0],x_val.shape[1]))

        print('Start training LinearSVR...')
        start_time = self.timer()

        svr = LinearSVR()
        svr.fit(x_tr, y_tr)
        print("The R2 is: {}".format(svr.score(x_tr, y_tr)))
        self.timer(start_time)

        print("Making prediction on validation data")
        y_val = np.expm1(y_val)
        y_val_pred = np.expm1(svr.predict(x_val))
        mae = mean_absolute_error(y_val, y_val_pred)
        print("El mean absolute error de es {}".format(mae))

        print('Saving model into a pickle')
        try:
            os.mkdir('pickles')
        except:
            pass

        with open('pickles/svrCV.pkl', 'wb') as f:
            pickle.dump(svr, f)

        print('Making prediction and saving into a csv')
        y_test = svr.predict(self.x_test)

        return y_test
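The expm1 calls in Example #1 only make sense if the target was log1p-transformed before training. A minimal self-contained sketch of that round trip, on synthetic data (not the author's dataset):

import numpy as np
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_absolute_error

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
price = np.exp(X @ rng.normal(size=5))  # strictly positive, skewed target

# train on log1p(price); map predictions back with expm1 before scoring
model = LinearSVR(max_iter=10000).fit(X, np.log1p(price))
pred = np.expm1(model.predict(X))
print(mean_absolute_error(price, pred))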
Example #2
class SVMWrapper:
    def __init__(self,
                 c=1.0,
                 e=0.0,
                 loss="epsilon_insensitive",
                 dual=True,
                 max_iter=1000):
        self.regressor = LinearSVR(C=c,
                                   epsilon=e,
                                   loss=loss,
                                   dual=dual,
                                   max_iter=max_iter)
        self.training_time = None

    def train(self, x_train, y_train):
        start = time.perf_counter()
        self.regressor.fit(x_train, y_train)
        self.training_time = time.perf_counter() - start

    def score(self, x_test, y_test):
        return self.regressor.score(x_test, y_test)

    def predict(self, x_test):
        return self.regressor.predict(x_test)

    def predict_one(self, x_single):
        return self.regressor.predict(x_single)

    def get_training_time(self):
        if self.training_time is None:
            raise ValueError("model has not been trained yet")
        return self.training_time
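A hypothetical use of the wrapper above, with invented data; it assumes the class and the imports it relies on (time, sklearn's LinearSVR, numpy) are in scope:

import numpy as np

rng = np.random.default_rng(1)
X = rng.normal(size=(100, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(scale=0.1, size=100)

wrapper = SVMWrapper(c=0.5, e=0.1, max_iter=5000)
wrapper.train(X, y)
print(wrapper.score(X, y))          # R^2 on the training data
print(wrapper.get_training_time())  # seconds spent inside fit()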
Example #3
def LinearSVRRegressor(X_train, X_test, y_train, y_test):
    y_train1 = y_train[:, 0]
    y_train2 = y_train[:, 1]
    reg1 = LinearSVR(epsilon=0.001,
                     max_iter=5000,
                     C=3,
                     loss='squared_epsilon_insensitive')
    reg1.fit(X_train, y_train1)
    reg2 = LinearSVR(epsilon=0.001,
                     max_iter=5000,
                     C=3,
                     loss='squared_epsilon_insensitive')
    reg2.fit(X_train, y_train2)
    y_pred1 = reg1.predict(X=X_test)
    y_pred2 = reg2.predict(X=X_test)
    y_pred = np.hstack((y_pred1.reshape(-1, 1), y_pred2.reshape(-1, 1)))

    printMetrics(y_true=y_test, y_pred=y_pred)

    val_metrics = getMetrics(y_true=y_test, y_pred=y_pred)
    y_pred1 = reg1.predict(X=X_train)
    y_pred2 = reg2.predict(X=X_train)
    y_pred = np.hstack((y_pred1.reshape(-1, 1), y_pred2.reshape(-1, 1)))
    metrics = getMetrics(y_true=y_train, y_pred=y_pred)

    printMetrics(y_true=y_train, y_pred=y_pred)

    logSave(nameOfModel="LinearSVRRegressor",
            reg=[reg1, reg2],
            metrics=metrics,
            val_metrics=val_metrics)
Example #4
class LSVR:
    def __init__(self):
        super(LSVR, self).__init__()
        self.C = 0.1
        self.n_time = 5
        self.model = LinearSVR(C=self.C)

    def fit(self, train_x, train_y):
        self.model.fit(train_x, train_y)

    def predict(self, test_x):
        return self.model.predict(test_x)

    def eval(self, out_time, v_path, w_path):
        train_x, train_y, test_x, test_y = Helper.retrieve_data(
            n_time=5,
            out_time=out_time,
            train_pct=0.7,
            test_pct=0.2,
            v_path=v_path,
            w_path=w_path)
        train_x = np.squeeze(train_x.transpose(
            (0, 2, 1, 3))).reshape(-1, self.n_time)
        test_x = np.squeeze(test_x.transpose(
            (0, 2, 1, 3))).reshape(-1, self.n_time)
        train_y = train_y.reshape(-1)
        test_y = test_y.reshape(-1)
        print("LSVR Fitting...")
        self.model.fit(train_x, train_y)
        print("LSVR Fitted!")
        y_pred = self.model.predict(test_x)
        Helper.metrics(y_pred, test_y)
Example #5
def LinearSVRRegressorGS(X_train, X_test, y_train, y_test):
    y_train1 = y_train[:, 0]
    y_train2 = y_train[:, 1]
    reg1 = LinearSVR()
    reg2 = LinearSVR()
    grid_values = {
        'epsilon': list(range(1, 3)) + [value * 0.01 for value in range(1, 3)],
        'C': [value * 0.01 for value in range(1, 3)],
        'loss': ['epsilon_insensitive', 'squared_epsilon_insensitive']
    }

    grid_reg1 = GridSearchCV(
        reg1,
        param_grid=grid_values,
        scoring=['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2'],
        refit='r2',
        n_jobs=-1,
        cv=2,
        verbose=100)
    grid_reg1.fit(X_train, y_train1)
    reg1 = grid_reg1.best_estimator_
    reg1.fit(X_train, y_train1)
    grid_reg2 = GridSearchCV(
        reg2,
        param_grid=grid_values,
        scoring=['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2'],
        refit='r2',
        n_jobs=-1,
        cv=2,
        verbose=100)
    grid_reg2.fit(X_train, y_train2)
    reg2 = grid_reg2.best_estimator_
    reg2.fit(X_train, y_train2)
    y_pred1 = reg1.predict(X=X_test)
    y_pred2 = reg2.predict(X=X_test)
    y_pred = np.hstack((y_pred1.reshape(-1, 1), y_pred2.reshape(-1, 1)))

    printMetrics(y_true=y_test, y_pred=y_pred)

    val_metrics = getMetrics(y_true=y_test, y_pred=y_pred)
    y_pred1 = reg1.predict(X=X_train)
    y_pred2 = reg2.predict(X=X_train)
    y_pred = np.hstack((y_pred1.reshape(-1, 1), y_pred2.reshape(-1, 1)))
    metrics = getMetrics(y_true=y_train, y_pred=y_pred)

    printMetrics(y_true=y_train, y_pred=y_pred)

    best_params1: dict = grid_reg1.best_params_
    best_params2: dict = grid_reg2.best_params_
    best_params = {}
    for key in best_params1.keys():
        best_params[key] = [best_params1[key], best_params2[key]]
    saveBestParams(nameOfModel="LinearSVRRegressorGS", best_params=best_params)
    logSave(nameOfModel="LinearSVRRegressorGS",
            reg=[reg1, reg2],
            metrics=metrics,
            val_metrics=val_metrics)
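The two-regressor pattern in Examples #3 and #5 can also be written with scikit-learn's MultiOutputRegressor, which fits one clone of the base estimator per target column. A sketch with invented shapes:

import numpy as np
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import LinearSVR

rng = np.random.default_rng(2)
X_train = rng.normal(size=(120, 8))
y_train = rng.normal(size=(120, 2))  # two target columns, as in y_train[:, 0] / y_train[:, 1]

multi = MultiOutputRegressor(LinearSVR(epsilon=0.001, C=3,
                                       loss='squared_epsilon_insensitive',
                                       max_iter=5000))
multi.fit(X_train, y_train)
y_pred = multi.predict(X_train)  # shape (120, 2), no manual hstack needed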
Example #6
	def fitSVR(self, X, Y, name, lastX = None):
		if not hasattr(self, name):
			SVR = []
			setattr(self, name, SVR)
		else:
			SVR = getattr(self, name)

#		if "ridge_alpha" in self.args:
#			alpha = self.args['ridge_alpha']
#		else:
		epsilon_options = [0, 0.1, 10, 100]
		C_options = [0.1, 10, 100]

		Xselect = 30000

		kf = KFold(n_splits=5, shuffle=True)
		for i1, i2 in kf.split(X):
			train_index, test_index = i1[:Xselect], i2
			break

		bestscore = 3  # SMAPE is presumably bounded by 2 here, so 3 acts as an initial "infinity"
		bestarg = None
		for epsilon in epsilon_options:
			for C in C_options:
				logging.info("SVR trying %f %f", epsilon, C)
				model = LinearSVR(epsilon=epsilon, C=C)
				model.fit(X[train_index], Y[train_index][:, 24])
				if lastX is None:
					predY = model.predict(X[test_index])
					score = calSMAPE1(Y[test_index][:, 24], predY)
				else:
					predY = lastX[test_index][:, 0] + model.predict(X[test_index])
					score = calSMAPE1(lastX[test_index][:, 0] + Y[test_index][:, 24], predY)
				if score < bestscore:
					bestscore = score
					bestarg = (epsilon, C)
				logging.info("SVR try %f %f, score %f", epsilon, C, score)
		epsilon, C = bestarg
		logging.info("SVR best %f %f, bestscore %f", epsilon, C, bestscore)

		global SVRargs
		SVRargs = (X[train_index], Y[train_index], epsilon, C)

		for idx in self.divide(list(range(len(SVR), Y.shape[1])), 18):
			with mp.Pool(6) as pool:
				SVR += pool.map(train_SVR, idx)
			logging.info("SVR group %d", idx[0])
			self.saveModule(name, False)

		logging.info("SVR ok")
Example #7
    def test_linear_svr_evaluation(self):
        """
        Check that the evaluation results are the same in scikit learn and coremltools
        """
        ARGS = [
            {},
            {"C": 0.5, "epsilon": 0.25},
            {"dual": False, "loss": "squared_epsilon_insensitive"},
            {"tol": 0.005},
            {"fit_intercept": False},
            {"intercept_scaling": 1.5},
        ]

        input_names = self.scikit_data.feature_names
        df = pd.DataFrame(self.scikit_data.data, columns=input_names)

        for cur_args in ARGS:
            print(cur_args)
            cur_model = LinearSVR(**cur_args)
            cur_model.fit(self.scikit_data["data"], self.scikit_data["target"])
            spec = convert(cur_model, input_names, "target")

            df["prediction"] = cur_model.predict(self.scikit_data.data)

            metrics = evaluate_regressor(spec, df)
            self.assertAlmostEquals(metrics["max_error"], 0)
Example #8
def test_svm_model(train_X, train_y, dev_X, dev_y):
    print('Testing svm model...')
    from sklearn.svm import LinearSVR
    clf = LinearSVR()
    clf.fit(train_X, train_y)
    pred_y = clf.predict(dev_X)
    print('RMSE: {}'.format(math.sqrt(mean_squared_error(dev_y, pred_y))))
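LinearSVR is sensitive to feature scale, which is why several snippets here pair it with StandardScaler. A pipeline variant of the test above, sketched on synthetic data:

import math
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error

rng = np.random.default_rng(3)
train_X = rng.normal(size=(150, 4)) * [1, 10, 100, 1000]  # wildly different column scales
train_y = train_X[:, 0] + rng.normal(size=150)

clf = make_pipeline(StandardScaler(), LinearSVR(max_iter=10000))
clf.fit(train_X, train_y)
print('RMSE: {}'.format(math.sqrt(mean_squared_error(train_y, clf.predict(train_X)))))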
Example #9
def try_Cs(X, y, cv, Cs):
    results = []

    for C in Cs:
        t0 = time()
        scores = []

        for train_idx, val_idx in cv:
            svm = LinearSVR(C=C, loss='squared_epsilon_insensitive', dual=False, random_state=1)
            svm.fit(X[train_idx], y[train_idx])

            y_pred = svm.predict(X[val_idx])
            y_pred[y_pred < 0] = 0.0
            y_pred[y_pred > 1] = 1.0

            mse = mean_squared_error(y[val_idx], y_pred)
            scores.append(mse)

        m = np.mean(scores)
        s = np.std(scores)

        print('C=%s, took %.3fs, mse=%.3f+-%.3f' % (C, time() - t0, m, s))
        
        results.append((m.round(3), s, C))
    
    _, _, best_C = min(results)
    return best_C
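A hypothetical call to try_Cs, assuming the function and the names it uses (time, LinearSVR, mean_squared_error, np) are already imported. Note that cv must be a reusable list of (train_idx, val_idx) pairs rather than a one-shot generator, because the function iterates over it once per candidate C:

import numpy as np
from sklearn.model_selection import KFold

rng = np.random.default_rng(4)
X = rng.normal(size=(300, 6))
y = rng.uniform(size=300)  # targets in [0, 1], matching the clipping above

cv = list(KFold(n_splits=3, shuffle=True, random_state=1).split(X))
best_C = try_Cs(X, y, cv, Cs=[0.01, 0.1, 1.0, 10.0])
print(best_C)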
Example #10
class LinearSVRPrim(primitive):
    def __init__(self, random_state=0):
        super(LinearSVRPrim, self).__init__(name='LinearSVR')
        self.hyperparams = []
        self.type = 'Regressor'
        self.description = "We make use of the epsilon-insensitive loss, i.e. errors of less than epsilon are ignored. This is the form that is directly optimized by LinearSVR."
        self.hyperparams_run = {'default': True}
        self.random_state = random_state
        self.model = LinearSVR()
        self.accept_type = 'c_r'

    def can_accept(self, data):
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        # data = handle_data(data)
        return True

    def fit(self, data):
        data = handle_data(data)
        self.model.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        output['predictions'] = self.model.predict(output['X'])
        output['X'] = pd.DataFrame(output['predictions'], columns=[self.name+"Pred"])
        final_output = {0: output}
        return final_output
Example #11
def mse_of_linear_svr(X, y, epsilon):
    """
        Compute the mean square error of a linear SVR predictor with hyperparameter epsilon.
        As a model, use LinearSVR library to train a linear SVR predictor.
        Set its epsilon hyperparameter to the value of the epsilon argument,
            and its random state to 5.

        Split the dataset into training dataset, test dataset, training labels, and test labels;
            with 0.2 as the test size and 5 as its random state.
        Use StandardScaler to scale both datasets.

        Fit and test the model, and return the mean square error on the test dataset.

        Args:
            X - (n, d) numpy array of the dataset of n sample points each with d features
            y - (n, ) numpy array of the label values for each sample point
            epsilon - a scalar of the hyperparameter epsilon of a linear SVR predictor
        Returns:
            mse - a scalar of the mean square error of the test dataset
    """
    # Write your code here
    model = LinearSVR(epsilon=epsilon, random_state=5)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    return mean_squared_error(y_test, y_pred)
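A possible invocation on a toy dataset, assuming the function above and the names it uses (LinearSVR, train_test_split, StandardScaler, mean_squared_error) are already imported:

from sklearn.datasets import load_diabetes

X, y = load_diabetes(return_X_y=True)
for eps in (0.0, 0.5, 1.0):
    print(eps, mse_of_linear_svr(X, y, epsilon=eps))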
Example #12
def train_svr(X, y, plot=False, linear=False):
    """
    Trains a SVR Model. If the parameter linear is given, trains a Linear SVR.
    :param X: X of the current dataset
    :param y: target of the current dataset
    :param plot: either true or false. Controls if plots are shown while training this model.
    :param linear: either true or false. Controls if the trained model will be a Linear SVR or an Epsilon-SVR.
    :return: trained SVR model
    """
    print("Training SVR Model")
    if linear:
        estimator = LinearSVR()
        model = LinearSVR()
    else:
        estimator = SVR()
        model = SVR()
    model_name = type(estimator).__name__
    estimated_test_error = estimate_test_error(estimator, X, y)
    print("Estimated test error for {} model : {}".format(
        model_name, estimated_test_error))
    model.fit(X, y)
    y_pred = model.predict(X)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    print("Training error for {} model : {}".format(model_name, rmse))
    if plot:
        plot_residuals(y_pred, y, model_name)
    return model
Example #13
def GlobalRegression(local_binary_features, targets):
    t1 = time.time()
    updates = np.zeros((len(targets), param_landmark_num, 2))
    svrs = []
    for i in range(param_landmark_num):
        # dx
        svr_x = LinearSVR(C=1. / len(targets), dual=True, loss='squared_epsilon_insensitive', epsilon=0.0001)
        svr_x.fit(local_binary_features, targets[:, i, 0])
        updates[:, i, 0] = svr_x.predict(local_binary_features)
        # dy
        svr_y = LinearSVR(C=1. / len(targets), dual=True, loss='squared_epsilon_insensitive', epsilon=0.0001)
        svr_y.fit(local_binary_features, targets[:, i, 1])
        updates[:, i, 1] = svr_y.predict(local_binary_features)
        svrs.append([svr_x, svr_y])
    print('Global regression took', time.time() - t1, 's')
    return updates, svrs
Example #14
def test_data_truth():
    n = 100
    d = 10
    strRel = 2
    generator = check_random_state(1337)
    X, Y = genRegressionData(
        n_samples=n,
        n_features=d,
        n_redundant=0,
        n_strel=strRel,
        n_repeated=0,
        random_state=generator,
        noise=0,
    )
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        random_state=generator)

    linsvr = LinearSVR()
    linsvr.fit(X_train, y_train)
    pred = linsvr.predict(X_test)
    r2 = r2_score(y_test, pred)

    assert r2 > 0.9
Example #15
    def test_linear_svr_evaluation(self):
        """
        Check that the evaluation results are the same in scikit learn and coremltools
        """
        ARGS = [{}, {
            'C': 0.5,
            'epsilon': 0.25
        }, {
            'dual': False,
            'loss': 'squared_epsilon_insensitive'
        }, {
            'tol': 0.005
        }, {
            'fit_intercept': False
        }, {
            'intercept_scaling': 1.5
        }]

        input_names = self.scikit_data.feature_names
        df = pd.DataFrame(self.scikit_data.data, columns=input_names)

        for cur_args in ARGS:
            print(cur_args)
            cur_model = LinearSVR(**cur_args)
            cur_model.fit(self.scikit_data['data'], self.scikit_data['target'])
            spec = convert(cur_model, input_names, 'target')

            df['prediction'] = cur_model.predict(self.scikit_data.data)

            metrics = evaluate_regressor(spec, df)
            self.assertAlmostEqual(metrics['max_error'], 0)
Example #16
class TestLinearSVRIntegration(TestCase):
    def setUp(self):
        df = pd.read_csv(path.join(BASE_DIR, '../models/categorical-test.csv'))
        Xte = df.iloc[:, 1:]
        Xenc = pd.get_dummies(Xte, prefix_sep='')
        yte = df.iloc[:, 0]
        self.test = (Xte, yte)
        self.enc = (Xenc, yte)

        pmml = path.join(BASE_DIR, '../models/linear-model-lm.pmml')
        self.clf = PMMLLinearSVR(pmml)

        self.ref = LinearSVR()
        self.ref.fit(Xenc, yte == 'Yes')

    def test_invalid_model(self):
        with self.assertRaises(Exception) as cm:
            PMMLLinearSVR(pmml=StringIO("""
              <PMML xmlns="http://www.dmg.org/PMML-4_3" version="4.3">
                <DataDictionary>
                  <DataField name="Class" optype="categorical" dataType="string">
                    <Value value="setosa"/>
                    <Value value="versicolor"/>
                    <Value value="virginica"/>
                  </DataField>
                </DataDictionary>
                <MiningSchema>
                  <MiningField name="Class" usageType="target"/>
                </MiningSchema>
              </PMML>
              """))

        assert str(
            cm.exception) == 'PMML model does not contain RegressionModel.'

    def test_fit_exception(self):
        with self.assertRaises(Exception) as cm:
            self.clf.fit(np.array([[]]), np.array([]))

        assert str(cm.exception) == 'Not supported.'

    def test_more_tags(self):
        assert self.clf._more_tags() == LinearSVR()._more_tags()

    def test_sklearn2pmml(self):
        # Export to PMML
        pipeline = PMMLPipeline([("classifier", self.ref)])
        pipeline.fit(self.enc[0], self.enc[1] == 'Yes')
        sklearn2pmml(pipeline, "svm-sklearn2pmml.pmml", with_repr=True)

        try:
            # Import PMML
            model = PMMLLinearSVR(pmml='svm-sklearn2pmml.pmml')

            # Verify classification
            Xenc, _ = self.enc
            assert np.allclose(self.ref.predict(Xenc), model.predict(Xenc))

        finally:
            remove("svm-sklearn2pmml.pmml")
Example #17
def linear_svm_regression():
    np.random.seed(42)
    m = 50
    X = 2 * np.random.rand(m, 1)
    y = (4 + 3 * X + np.random.randn(m, 1)).ravel()
    svm_reg1 = LinearSVR(epsilon=1.5, random_state=42)
    svm_reg2 = LinearSVR(epsilon=0.5, random_state=42)
    svm_reg1.fit(X, y)
    svm_reg2.fit(X, y)
    svm_reg1.support_ = find_support_vectors(svm_reg1, X, y)
    svm_reg2.support_ = find_support_vectors(svm_reg2, X, y)

    eps_x1 = 1
    eps_y_pred = svm_reg1.predict([[eps_x1]])

    plt.figure(figsize=(9, 4))
    plt.subplot(121)
    plot_svm_regression(svm_reg1, X, y, [0, 2, 3, 11])
    plt.title(r"$\epsilon = {}$".format(svm_reg1.epsilon), fontsize=18)
    plt.ylabel(r"$y$", fontsize=18, rotation=0)
    # plt.plot([eps_x1, eps_x1], [eps_y_pred, eps_y_pred - svm_reg1.epsilon], "k-", linewidth=2)
    plt.annotate(
        '', xy=(eps_x1, eps_y_pred), xycoords='data',
        xytext=(eps_x1, eps_y_pred - svm_reg1.epsilon),
        textcoords='data', arrowprops={'arrowstyle': '<->', 'linewidth': 1.5}
    )
    plt.text(0.91, 5.6, r"$\epsilon$", fontsize=20)
    plt.subplot(122)
    plot_svm_regression(svm_reg2, X, y, [0, 2, 3, 11])
    plt.title(r"$\epsilon = {}$".format(svm_reg2.epsilon), fontsize=18)
    plt.show()
Example #18
def outlier_linearSVR_detector(feature, target, residual_threshold, return_index = False):
	"""
	Detect outliers by fitting a LinearSVR (linear kernel); residual_threshold
	is interpreted as a fraction of the target's range, and the fitted slope
	is returned alongside the outlier count.
	"""
	target = (np.array(target)).flatten()
	residual_threshold = (np.max(target) - np.min(target)) * residual_threshold
	regr = LinearSVR(random_state=1, dual=True, epsilon=0.0)
	regr.fit(feature, target)
	
	predict_data = regr.predict(feature)
	i=0
	num_of_outlier = 0
	outlier_index = []
	for x in predict_data:
		delta = x-target[i]
		if abs(delta) > residual_threshold:
			num_of_outlier = num_of_outlier + 1	
			outlier_index.append(i)
		i=i+1
	slope = regr.coef_[0]
	
	if return_index is False:
		return (num_of_outlier, slope)
	else:
		return outlier_index
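A sketch of calling the detector on noisy linear data with a few injected outliers (entirely synthetic, for illustration only):

import numpy as np

rng = np.random.default_rng(5)
feature = rng.uniform(0, 10, size=(200, 1))
target = 2.5 * feature[:, 0] + rng.normal(scale=0.1, size=200)
target[:5] += 50  # inject five obvious outliers

n_out, slope = outlier_linearSVR_detector(feature, target, residual_threshold=0.2)
idx = outlier_linearSVR_detector(feature, target, 0.2, return_index=True)
print(n_out, slope, idx)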
Example #19
    async def do_run_async(self):
        # Generate some non-linear data based on a quadratic equation
        m = 100
        X = 6 * np.random.uniform(1, 5, (m, 1)) - 3
        y = (0.5 * X**2 + X + 2 + np.random.uniform(1, 5, (m, 1))).ravel()  # 1-D target, as sklearn regressors expect

        plt.plot(X, y, ".")
        plt.show()

        # To tackle nonlinear regression tasks, you can use a kernelized SVM model
        svm_poly_reg = SVR(kernel="poly", degree=2, C=100, epsilon=0.1)
        svm_poly_reg.fit(X, y)

        rand_index = np.random.randint(0, 99)
        x = X[rand_index, ]
        print("Prediction for:", x)
        print(svm_poly_reg.predict([x]))
        print("Label:", y[rand_index, ])

        # ... or just use the Linear SVR algorithm with polynomial features
        polly = PolynomialFeatures(
            degree=2)  # Polynomial degree is usually number of features + 1?
        X_tr = polly.fit_transform(X)

        svm_reg = LinearSVR(epsilon=1.5)
        svm_reg.fit(X_tr, y)

        rand_index = np.random.randint(0, 99)
        x = X_tr[rand_index, ]
        print("Prediction for:", x)
        print(svm_reg.predict([x]))
        print("Label:", y[rand_index, ])
Example #20
def build_svr(params=None):
    train_df, test_df = load_data()
    combined_df = pd.concat((train_df.loc[:, 'MSSubClass':'SaleCondition'],
                             test_df.loc[:, 'MSSubClass':'SaleCondition']))

    # feature engineering
    config_categorical_features(combined_df)
    # combined_df = extract_common_features(combined_df)
    log_transform_features(combined_df)
    combined_df = normalize_numerical_features(combined_df)
    combined_df = one_hot_encoding(combined_df)
    missing_value_fill(combined_df)

    X_train = combined_df[:train_df.shape[0]]
    X_test = combined_df[train_df.shape[0]:]
    y = np.log1p(train_df["SalePrice"])

    if params is None:
        params = tuning(X_train, y)

    # model training
    model = LinearSVR(**params)
    model.fit(X_train, y)
    print("cross_validation_rmse:", np.mean(np.sqrt(-cross_val_score(model, X_train, y, cv=3, scoring="neg_mean_squared_error"))))

    # model prediction
    lasso_preds = np.expm1(model.predict(X_test))
    solution = pd.DataFrame({"id": test_df.Id, "SalePrice": lasso_preds})
    solution.to_csv("./house_price/submission_svr_v1.csv", index=False)
Example #21
def svr_C(train_features, train_labels, test_features, test_labels, name):
    """
    Plot C against the accuracy.
    """
    sns.set()
    sns.set_style("ticks")

    train_results = []
    test_results = []

    c_values = np.linspace(1e-4, 1, 10)

    train_scaled, scaler = ml_funcs.apply_scaling(train_features, 'SVR', name, save_scaler=False)
    test_scaled = scaler.transform(test_features)

    for c_val in c_values:
        print("C:", c_val)

        svr = LinearSVR(C=c_val, max_iter=2000, random_state=0)
        svr.fit(train_scaled, train_labels)
        predict_train = svr.predict(train_scaled)

        # Accuracy of training data (mean absolute percentage error)
        accuracy_train = compute_accuracy(predict_train, train_labels)
        train_results.append(accuracy_train)

        predict_test = svr.predict(test_scaled)

        # Accuracy for test data.
        accuracy_test = compute_accuracy(predict_test, test_labels)
        test_results.append(accuracy_test)

    fig = plt.figure(figsize=(10, 6))
    sns.lineplot(x=c_values, y=train_results, label='Train')
    sns.lineplot(x=c_values, y=test_results, label='Test')
    plt.legend(frameon=False, loc='lower right')
    plt.xlabel('C')
    plt.ylabel('Accuracy score [%]')

    fig.tight_layout()
    sns.despine()

    if generate_plots.directory_exists("./Figures"):
        plt.savefig("./Figures/C_" + name + ".pdf", bbox_inches="tight", dpi=300,
                    transparent=True)
    else:
        print("Directory: ./Figures does not exist!")
Example #22
def linearSVR(data):
    X = data.drop(["id", "date", "price","long","lat", "zipcode","yr_renovated", "sqft_above", "sqft_basement"], axis=1)
    y = data["price"]
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.10, random_state=42)
    svr = LinearSVR(random_state=42)
    svr.fit(X_train, y_train)
    y_predict = svr.predict(X_test)
    print "r2-score for LinearSVR: %f" % r2_score(y_test, y_predict)
Example #23
def innerfold_svr(x_test, y_test, x_train, y_train):
    svr_rbf = LinearSVR(random_state=4)
    svr_rbf.fit(x_train, y_train)
    pred_y = svr_rbf.predict(x_test)
    mse = mean_squared_error(y_test, pred_y)
    rmse = math.sqrt(mse)
    print(rmse)
    return rmse
Example #24
def regressor_test(complete,incomplete,years):
    kn_errors = []
    linear_errors = []
    svr_errors = []    
    
    for i in years[0]:
            
        X_train, X_test, y_train, y_test = train_test_split(complete.loc[:,complete.columns != i].values,
                                                            complete.loc[:,i].values, test_size = 0.2, random_state = 0)
        
        regressor1 = KNeighborsRegressor(2, 
                                       weights ='distance', 
                                       metric = 'euclidean')
        regressor2= LinearRegression()
        regressor3=LinearSVR()
        
        
        trained_model1 = regressor1.fit(X_train, 
                                 y_train)
        trained_model2 = regressor2.fit(X_train, 
                                 y_train)
        trained_model3 = regressor3.fit(X_train, 
                                 y_train)  
        
        incomplete_2 = deepcopy(incomplete)
        incomplete_2.loc[:, incomplete.columns != i] = incomplete_2.loc[:, 
                                incomplete.columns != i].apply(lambda row: row.fillna(row.mean()), axis=1)

        y_pred1 = regressor1.predict(X_test)
        y_pred2 = regressor2.predict(X_test)
        y_pred3 = regressor3.predict(X_test)
        
        
        kn_errors.append(mean_squared_error(y_test, y_pred1))
        linear_errors.append(mean_squared_error(y_test, y_pred2))
        svr_errors.append(mean_squared_error(y_test, y_pred3))
        
        
        #Test for checking the best model 
    MSE= []

    for i in range(0, len(complete.loc[:,'2007':'2017'].columns)):
        l = []
        l.extend((kn_errors[i], linear_errors[i], svr_errors[i]))
        
        if min(l) == kn_errors[i]:
            MSE.append("KNN")
        elif min(l) == linear_errors[i]:
            MSE.append("Linear")
        elif min(l) == svr_errors[i]:
            MSE.append("SVR")

    
    print("KNN =",MSE.count("KNN"),'\nLinear =',MSE.count("Linear") ,'\nSVR =',MSE.count("SVR"))


    return max(set(MSE), key = MSE.count)
Example #25
def svm_regressor(train_data, train_label, test_data, test_label, parameters):
    min_error = 10000000000
    error = []

    # tuned_parameters = [{'kernel': ['rbf'], 'gamma': [100,10,1,1e-1, 1e-2,],
    #                      'C': [0.1,1, 10, 100], 'epsilon':[ 100, 1000, 10000,1e6,1e8]}]
    #                     # {'kernel': ['linear'], 'C': [1, 10, 100, 1000], 'epsilon': [1, 10,100,1000]},
    #                     # {'kernel':['poly'],'gamma': [1e-3, 1e-4],
    #                     #  'C': [1, 10, 100, 1000], 'epsilon':[ 1, 10, 100,1000]}]
    # # {'kernel': ['linear'], 'C': [1, 10, 100, 1000], 'epsilon': [1e-2, 1e-1, 1, 10]}
    # clf = GridSearchCV(SVR(), tuned_parameters, cv=5,verbose=1,n_jobs=-1)
    # clf.fit(train_data, train_label)
    # print clf.best_params_
    # print clf.cv_results_
    # tuned_parameters = [{'C': [1e-2,1e-1,1, 10, 100], 'epsilon': [1, 10, 100, 1000,10000]}]
    # clf = GridSearchCV(LinearSVR(random_state=random_state), tuned_parameters, cv=5, verbose=1, n_jobs=-1)
    # clf.fit(train_data, train_label)
    # print clf.best_params_
    # print clf.cv_results_

    # regr = SVR(kernel='rbf', gamma=0.01,C=100)
    # regr.fit(train_data, train_label)
    # score = regr.score(test_data, test_label)
    # predict = regr.predict(test_data)
    # predict = map(lambda x: [x], predict)
    # predict = np.array(predict)
    # mse = MSE(np.array(predict), test_label)
    # if (mse[0] < min_error):
    #     min_error = mse[0]
    # print mse[0]
    regr = LinearSVR(C=0.001, epsilon=1, random_state=random_state)
    regr.fit(train_data, train_label)
    score = regr.score(test_data, test_label)
    predict = regr.predict(test_data).reshape(-1, 1)  # column vector, shape (n, 1)
    mse = MSE(predict, test_label)
    if mse[0] < min_error:
        min_error = mse[0]

    print('MSE ' + parameters + ' ' + str(mse[0]))

    df = pd.Series(predict.flatten(), index=test_label.index)
    price = pd.concat([train_label, test_label])
    plt.title('SVM Regression on ' + parameters)
    plt.plot(price[1000:-1], label='actual price')
    plt.plot(df, label='predicted price')
    plt.legend(loc='lower right')
    plt.xlabel('Dates')
    plt.ylabel('Price')
    # plt.show()
    directory = './svm/'
    if not os.path.exists(directory):
        os.makedirs(directory)
    plt.savefig(directory + parameters + '.png')
    plt.close()
    return
Example #26
def linear_svr_pred(X_train, Y_train):
    """
    Train a linear model with Support Vector Regression
    """

    svr_model = LinearSVR(random_state=RANDOM_STATE)
    svr_model.fit(X_train, Y_train)
    Y_pred = svr_model.predict(X_train)
    return Y_pred
Example #27
class LibLinear_SVR:
    # Liblinear is not deterministic as it uses a RNG inside
    def __init__(self,
                 epsilon,
                 loss,
                 dual,
                 tol,
                 C,
                 fit_intercept,
                 intercept_scaling,
                 random_state=None):
        self.epsilon = epsilon
        self.loss = loss
        self.dual = dual
        self.tol = tol
        self.C = C
        self.fit_intercept = fit_intercept
        self.intercept_scaling = intercept_scaling
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, Y):
        from sklearn.svm import LinearSVR

        # In case of nested loss
        if isinstance(self.loss, dict):
            combination = self.loss
            self.loss = combination['loss']
            self.dual = combination['dual']

        self.epsilon = float(self.epsilon)
        self.C = float(self.C)
        self.tol = float(self.tol)

        self.dual = check_for_bool(self.dual)

        self.fit_intercept = check_for_bool(self.fit_intercept)

        self.intercept_scaling = float(self.intercept_scaling)

        self.estimator = LinearSVR(epsilon=self.epsilon,
                                   loss=self.loss,
                                   dual=self.dual,
                                   tol=self.tol,
                                   C=self.C,
                                   fit_intercept=self.fit_intercept,
                                   intercept_scaling=self.intercept_scaling,
                                   random_state=self.random_state)
        self.estimator.fit(X, Y)
        return self

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)
Example #28
class SVMRegression(object):

    def __init__(self, X, y, epsilon, **kwargs):

        self.X = X
        self.y = y
        self.epsilon = epsilon
        self.model = LinearSVR(epsilon=epsilon, **kwargs) 

    def train_model(self):
        self.model.fit(self.X, self.y)
        self.epsilon = self.model.epsilon
        self.y_pred = self.model.predict(self.X)

    def get_support_vectors(self):
        """
        Get the indices of points that lie off the "street" (outside the epsilon margin)
        """
        self.if_off_margin = (np.abs(self.y - self.y_pred) >= self.epsilon)
        self.idx_support_ = np.argwhere(self.if_off_margin)
        return self.idx_support_

    def model_predict(self, x_new):
        return self.model.predict(x_new)

    def plot_svm_regression(self, axes):
        """
        Plot SVM Regression
        """
        x_new = np.linspace(axes[0], axes[1], 100).reshape(100, 1)
        y_estimate = self.model.predict(x_new)

        plt.plot(x_new, y_estimate, "k-", linewidth=2, label="Prediction of y")
        plt.plot(x_new, y_estimate + self.epsilon, "r--", label="Upper Bound")
        plt.plot(x_new, y_estimate - self.epsilon, "g--", label="Lower Bound")
        
        plt.scatter(self.X[self.idx_support_], self.y[self.idx_support_], s=180, facecolors='#FFAAAA')
        plt.plot(self.X, self.y, "bo")
        plt.xlabel(r"$x_1$", fontsize=18)
        plt.ylabel(r"$y$", fontsize=18, rotation=0)
        plt.legend(loc="best", fontsize=18)
        plt.axis(axes)
Example #29
def predict_SVM():
    svclassifier = LinearSVR(random_state=50,
                             max_iter=100000,
                             epsilon=0,
                             tol=1e-9)
    svclassifier.fit(X_train_csr.todense(), y_train_1)
    scv_test_predict = svclassifier.predict(X_test_csr.todense())
    print(scv_test_predict)
    print(classification_report(y_test_1, np.rint(scv_test_predict)))
    print("RMSE for Neural Random SVR Classifier",
          sqrt(mean_squared_error(y_test_1, np.rint(scv_test_predict))))
Example #30
def Linear_SVR(Xtrain, Xtest, ytrain, ytest):
    cv_scores = []
    parameters = [0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.25, 2.5]
    for i in parameters:
        clf = LinearSVR(loss='squared_epsilon_insensitive', C=i)
        clf.fit(Xtrain, ytrain)
        y_pred = clf.predict(Xtest)
        #print clf.score(y_test, y_pred)
        cv_scores.append(metrics.r2_score(ytest, y_pred))
    print("LinearSVR")
    print(sum(cv_scores) / float(len(cv_scores)))
Example #31
def main():
    # Load the data
    train_data = pd.read_csv('d_train_20180102.csv', encoding='GBK')
    train_bloods = train_data['血糖'].astype(float)
    test_data = pd.read_csv('d_test_A_20180102.csv', encoding='GBK')
    test_bloods = pd.read_csv('d_answer_a_20180128.csv',
                              encoding='GBK').astype(float)
    train_data = train_data.drop(['id', '体检日期'], axis=1)
    test_data = test_data.drop(['id', '体检日期'], axis=1)
    train_data = train_data.drop(
        ['乙肝表面抗原', '乙肝表面抗体', '乙肝e抗原', '乙肝e抗体', '乙肝核心抗体', '血糖'], axis=1)
    test_data = test_data.drop(
        ['乙肝表面抗原', '乙肝表面抗体', '乙肝e抗原', '乙肝e抗体', '乙肝核心抗体'], axis=1)

    label = train_data.columns
    encoder = LabelEncoder()
    train_data['性别'] = encoder.fit_transform(train_data['性别'])
    test_data['性别'] = encoder.fit_transform(test_data['性别'])
    train_data = train_data.astype(float)
    test_data = test_data.astype(float)
    for i in label:
        train_data[i].fillna(train_data[i].mean(), inplace=True)
        test_data[i].fillna(test_data[i].mean(), inplace=True)

    scaler = StandardScaler()
    train_data = pd.DataFrame(scaler.fit_transform(train_data))  # standardize
    test_data = pd.DataFrame(scaler.fit_transform(test_data))  # standardize (note: refits the scaler on the test set)

    # regression calls for a linear SVR
    lin_svr = LinearSVR(random_state=42)
    lin_svr.fit(train_data, train_bloods)
    predict_bloods = lin_svr.predict(test_data)
    mse = mean_squared_error(test_bloods, predict_bloods)
    print(mse)
    print(np.sqrt(mse))
    param_distributions = {
        'gamma': reciprocal(0.001, 0.1),
        'C': uniform(1, 10)
    }
    rnd_search_cv = RandomizedSearchCV(SVR(),
                                       param_distributions,
                                       n_iter=4,
                                       verbose=2,
                                       cv=3,
                                       random_state=42)
    train_bloods = pd.DataFrame(train_bloods)
    rnd_search_cv.fit(train_data, train_bloods)
    y_pred = rnd_search_cv.best_estimator_.predict(train_data)
    mse = mean_squared_error(train_bloods, y_pred)
    print(np.sqrt(mse))  # 0.5727524770785356
    y_pred = rnd_search_cv.best_estimator_.predict(test_data)
    mse = mean_squared_error(test_bloods, y_pred)
    print(np.sqrt(mse))  # 0.592916838552874
Example #32
class SVRR(object):

    def __init__(self, C):
        self.regression = LinearSVR(C=C)

    def fit(self, xs, ys):
        xs = xs.values
        ys = ys['y']
        self.regression.fit(xs, ys)

    def predict(self, xs):
        xs = xs.values
        ys = self.regression.predict(xs)
        return ys
Example #33
    class LinearSVRPermuteCoef:
        def __init__(self, **kwargs):
            self.model = LinearSVR(**kwargs)

        def fit(self, X, y):
            self.model.fit(X, y)

            self.coef_ = self.model.coef_
            self.intercept_ = self.model.intercept_

            def add_coef(arr, fn):
                arr.append(fn(self.coef_))

            add_coef(coeffs_state['max'], np.max)
            add_coef(coeffs_state['min'], np.min)

            return self

        def get_params(self, deep=True):
            return self.model.get_params(deep)

        def set_params(self, **kwargs):
            self.model.set_params(**kwargs)
            return self

        def predict(self, X):
            return self.model.predict(X)

        def score(self, X, y, sample_weight=None):
            if sample_weight is not None:
                return self.model.score(X, y, sample_weight)
            else:
                return self.model.score(X, y)

        @staticmethod
        def permute_min_coefs():
            return coeffs_state['min']

        @staticmethod
        def permute_max_coefs():
            return coeffs_state['max']

        @staticmethod
        def reset_perm_coefs():
            coeffs_state['min'] = []
            coeffs_state['max'] = []
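LinearSVRPermuteCoef reads and writes a coeffs_state dict that is not shown in the snippet; presumably it is defined in an enclosing scope along these lines:

coeffs_state = {'min': [], 'max': []}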
Example #34
def build_svm(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a support vector regression model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :param n_features: number of features; used to name the output pickle file
    :return: None
    """

    clf = LinearSVR(random_state=1, dual=False, epsilon=0,
                    loss='squared_epsilon_insensitive')
    # Random state has int value for non-random sampling
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    with open('../trained_networks/svm_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return
Example #35
cat_vars = ['DayOfWeek','Promo','StateHoliday','SchoolHoliday','StoreType','Assortment','CompetitionOpenSinceMonth',
            'CompetitionOpenSinceYear','Promo2','Promo2SinceWeek','Promo2SinceYear','PromoInterval','Day','Month','Year']


num_vars = ['Open','Store','CompetitionDistance','ratio1','ratio2']



X_trn, X_val = train_test_split(train, test_size=0.012, random_state=10)

print('Training Stage 1 Models')

#train svm
svm1 = LinearSVR(verbose=True)
svm1.fit(X_trn[cat_vars+num_vars],X_trn['Sales'])
svm1_feature = svm1.predict(train[cat_vars+num_vars])
preds = svm1.predict(X_val[cat_vars+num_vars])
print('svm ', (np.mean(((np.exp(preds) - np.exp(X_val['Sales'])) / (np.exp(X_val['Sales']) + 1))**2))**0.5)


#train xgb
dtrain = xgb.DMatrix(X_trn[cat_vars+num_vars],X_trn['Sales'])
dvalid = xgb.DMatrix(X_val[cat_vars+num_vars],X_val['Sales'])
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

num_boost_round = 50
params1 = {"objective": "reg:linear","booster" : "gbtree",
"eta": 0.5,"max_depth": 2,"subsample": 0.5,"colsample_bytree": 0.4,
"nthread":4,"silent": 1,"seed": 1301}
gbm1 = xgb.train(params1, dtrain, num_boost_round, evals=watchlist,early_stopping_rounds=50, feval=rmspe_xg, verbose_eval=True)
Example #36
    linsvr = LinearSVR(epsilon=0.1, tol=1e-4, C=1.0, loss='squared_epsilon_insensitive')
    linsvr.fit(explanatory_df, response_series)
    linsvr_rsq[c] = linsvr.score(explanatory_df, response_series)
    
    # prediction and linear extrapolation of training data set to get further predictions.
    test_cluster = train_cluster.copy()
    
    explanatory_testdf = test_cluster[explanatory_features]
    response_testseries = test_cluster.y
    
    for i in range(0,(len(cluster_i) - 5)):
       test_cluster.loc[i] = [cluster_i.iloc[i], cluster_i.iloc[i+1], 
                            cluster_i.iloc[i+2], cluster_i.iloc[i+3], 
                            cluster_i.iloc[i+4],
                            linsvr.predict(explanatory_df)[i]]
    
    # further running time series to predict into the future
    j = len(test_cluster) - 1
    for i in range(j, j+forecast_years):
       explanatory_testdf = test_cluster[explanatory_features]
       test_list = test_cluster.iloc[i, 1:6].tolist()
       y_est = linsvr.predict(explanatory_testdf)
       test_list.append(y_est[i])
       test_series = pd.Series(test_list, index = train_cluster.columns)
       test_cluster = pd.concat([test_cluster, test_series.to_frame().T], ignore_index=True)
    
    linsvr_test_clustery[c] = test_cluster['y']
    linsvr_residuals = test_cluster['y'][0:len(train_cluster)] - train_cluster['y']
    
    linsvr_RMSE[c] = (((linsvr_residuals)**2).mean())**(0.5)
Example #37
class TextLearner(object):
    def __init__(self,data_path,model_path = "./",name = ""):
        self.name = name
        self.data_path = data_path
        self.model_path = model_path
        self.DesignMatrix = []
        self.TestMatrix = []
        self.X_train = []
        self.y_train = [] # not only train but general purpose too
        self.X_test = []
        self.y_test  = []
        self.y_pred = []
        self.vectorizer = None
        self.feature_names = None
        self.chi2 = None
        self.mlModel = None
        self.F = Filter()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.DesignMatrix = []
        self.TestMatrix = []
        self.X_train = []
        self.y_train = []
        self.X_test = []
        self.y_test  = []
        self.y_pred = []
        self.vectorizer = None
        self.feature_names = None
        self.chi2 = None
        self.mlModel = None
        self.F = None

    def addModelDetails(self,model_p,name = ""):
        self.name = name
        self.model_path = model_p


    def load_data(self,TrTe = 0):               #TrTe => 0-Train  1-Test # returns the dimensions of vectors
        with open( self.data_path, 'rb') as f:
            if TrTe == 0:
                self.DesignMatrix = pickle.load(f)
                return len(self.DesignMatrix[1])
            if TrTe == 1:
                self.TestMatrix = pickle.load(f)
                return len(self.TestMatrix[1])

    def clearOld(self):
        self.X_train = []
        self.y_train = []
        self.X_test = []
        self.y_test  = []
        self.y_pred = []
        self.vectorizer = None
        self.feature_names = None
        self.chi2 = None
        self.mlModel = None


    def process(self,text,default = 0):
        if default == 0:
            text = text.strip().lower().encode("utf-8")
        else:
            text = self.F.process(text)
        return text


    def loadXY(self,TrTe = 0,feature_index = 0,label_index = 1):     #TrTe => 0-Train  1-Test
        if TrTe == 0:
            for i in self.DesignMatrix:
                self.X_train.append(self.process(i[feature_index]))
                self.y_train.append(i[label_index])
            self.X_train = np.array(self.X_train)
            self.y_train = np.array(self.y_train)

        elif TrTe == 1:
            for i in self.TestMatrix:
                self.X_test.append(self.process(i[feature_index]))
                self.y_test.append(i[label_index])
            self.X_test = np.array(self.X_test)
            self.y_test = np.array(self.y_test)


    def featurizeXY(self,only_train = 1):      # Extracts Features
        sw = ['a', 'across', 'am', 'an', 'and', 'any', 'are', 'as', 'at', 'be', 'been', 'being', 'but', 'by', 'can', 'could', 'did', 'do', 'does', 'each', 'for', 'from', 'had', 'has', 'have', 'in', 'into', 'is', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'of', 'on', 'or', 'that', "that's", 'thats', 'the', 'there', "there's", 'theres', 'these', 'this', 'those', 'to', 'under', 'until', 'up', 'were', 'will', 'with', 'would']
        self.vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words=sw)
        self.X_train = self.vectorizer.fit_transform(self.X_train)
        self.feature_names = self.vectorizer.get_feature_names()
        if only_train == 0:
            self.X_test = self.vectorizer.transform(self.X_test)


    def reduceDimension(self,only_train = 1, percent = 50):      # Reduce dimensions / self best of features
        n_samples, n_features = self.X_train.shape
        k = int(n_features*(percent/100))

        self.chi2 = SelectKBest(chi2, k=k)
        self.X_train = self.chi2.fit_transform(self.X_train, self.y_train)
        self.feature_names = [self.feature_names[i] for i in self.chi2.get_support(indices=True)]
        self.feature_names = np.asarray(self.feature_names)
        if only_train == 0:
            self.X_test = self.chi2.transform(self.X_test)


    def trainModel(self,Model = "default"):
        if Model == "default":
            self.mlModel = LinearSVR(loss='squared_epsilon_insensitive',dual=False, tol=1e-3)
        else:
            self.mlModel = Model
        self.mlModel.fit(self.X_train, self.y_train)


    def testModel(self,approx = 1):        # returns score ONLY
        self.y_pred = np.array(self.mlModel.predict(self.X_test))

        if approx == 1:
            ### To convert real valued results to binary for scoring
            temp = []
            for y in self.y_pred:
                if y > 0.0:
                    temp.append(1.0)
                else:
                    temp.append(-1.0)
            self.y_pred = temp

        return metrics.accuracy_score(self.y_test, self.y_pred)


    def getReport(self,save = 1, get_top_words = 0):       # returns report
        report = ""
        if get_top_words == 1:
            if hasattr(self.mlModel, 'coef_'):
                    report += "Dimensionality: " + str(self.mlModel.coef_.shape[1])
                    report += "\nDensity: " +  str(density(self.mlModel.coef_))

                    rank = np.argsort(self.mlModel.coef_[0])
                    top10 = rank[-20:]
                    bottom10 = rank[:20]
                    report += "\n\nTop 10 keywords: "
                    report += "\nPositive: " + (" ".join(self.feature_names[top10]))
                    report += "\nNegative: " + (" ".join(self.feature_names[bottom10]))

        score = metrics.accuracy_score(self.y_test, self.y_pred)
        report += "\n\nAccuracy: " + str(score)
        report += "\nClassification report: "
        report += "\n\n" + str(metrics.classification_report(self.y_test, self.y_pred,target_names=["Negative","Positive"]))
        report += "\nConfusion matrix: "
        report += "\n\n" + str(metrics.confusion_matrix(self.y_test, self.y_pred)) + "\n\n"

        if save == 1:
            with open(self.model_path + "report.txt", "w") as text_file:
                text_file.write(report)

        return report


    def crossVal(self,folds = 5, dim_red = 50,full_iter = 0, save = 1):        # returns report # Caution: resets train and test X,y
        skf = StratifiedKFold(n_splits=folds, shuffle=True)
        print(skf)
        master_report = ""

        X_copy = self.X_train
        y_copy = self.y_train

        for train_index, test_index in skf.split(self.X_train, self.y_train):
            self.X_train, self.X_test = X_copy[train_index], X_copy[test_index]
            self.y_train, self.y_test = y_copy[train_index], y_copy[test_index]
            self.featurizeXY(0)
            self.reduceDimension(0,dim_red)
            self.trainModel()
            self.testModel()
            master_report += self.getReport(save = 0,get_top_words = 0)
            if full_iter == 1:
                continue
            else:
                break

        if save == 1:
            with open(self.model_path + "master_report.txt", "w") as text_file:
                text_file.write(master_report)

        return master_report


    def save_obj(self,obj, name ):
        with open(self.model_path + name + '.pkl', 'wb') as f:
            pickle.dump(obj, f,  protocol=2)


    def saveModel(self):        # saves in model path
        self.save_obj(self.mlModel, self.name + "_model")
        self.save_obj(self.vectorizer, self.name + "_vectorizer")
        self.save_obj(self.chi2, self.name + "_feature_selector")


    def plot(self):
        '''
        beta (Just plotting the model) (Not working)
        '''

        h = .02  # step size in the mesh
        # create a mesh to plot in
        x_min, x_max = self.X_train[:, 0].min() - 1, self.X_train[:, 0].max() + 1
        y_min, y_max = self.X_train[:, 1].min() - 1, self.X_train[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),np.arange(y_min, y_max, h))

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, m_max]x[y_min, y_max].
        Z = self.mlModel.predict(np.c_[xx.ravel(), yy.ravel()])

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        plt.contour(xx, yy, Z, cmap=plt.cm.Paired)

        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
        plt.xticks(())
        plt.yticks(())
        plt.title(self.name)
        plt.savefig(self.model_path + 'plot.png')
Example #38
def linearSVR(train, trainLable, testData):
    clf = LinearSVR()
    clf.fit(train, trainLable)
    predict = clf.predict(testData)
    return predict
Example #39
# Select C via cross-validation
best_cv_score = -1e+30
for log2c in np.arange(-10, 30, 1):
    clf = LinearSVR(C=2**log2c, epsilon=0.0001)
    clf.fit(x_input_minmax, y_input)
    # leave-one-out CV; neg_mean_squared_error, so larger is better
    cv_score = cross_val_score(cv=sample_num, estimator=clf, X=x_input_minmax,
                               y=y_input, scoring='neg_mean_squared_error').mean()
    print(cv_score)
    if cv_score > best_cv_score:
        best_cv_score = cv_score
        bestc = 2**log2c


# Predict with the selected parameter
clf = LinearSVR(C=bestc, epsilon=0.0001)
clf.fit(x_input_minmax, y_input)
y_pred = clf.predict(x_input_minmax)
# y_pred = y_scaler.inverse_transform(y_pred.reshape(-1,1))

view_point = 5
plt.plot(x_input[:,view_point], y_input, 'bo-', x_input[:,view_point], y_pred, 'rs-')
plt.grid(True)
plt.legend(['y', 'y_pred'])
plt.show()
Example #40
	combined = np.append(X, np.matrix(Y).T, axis=1) 
	np.random.shuffle(combined)
	tail_size = -1 * size
	last_column = X.shape[1]
	training_labels = combined[:tail_size, last_column]
	training_data = combined[:tail_size, :-2]
	test_data = combined[tail_size:, :-2]
	actual_labels = combined[tail_size:, last_column]
	return training_data, np.ravel(training_labels), test_data, np.ravel(actual_labels)

training = open('author_features')
NO_TRAINING_SAMPLES = 6000
NO_OF_AUTHORS = 10000
matrix = dok_matrix((NO_TRAINING_SAMPLES, NO_OF_AUTHORS), dtype=int)
for line in training.readlines():
	values = line.rstrip().split()
	matrix[int(values[0]), int(values[1])] = 1

labels_file = open('year_training_labels')
labels = [int(x) for x in labels_file.readline().rstrip().split()]

training_matrix = matrix[:4498]
training_data, training_labels, test_data, actual_labels = sample(training_matrix, labels)
classifier = LinearSVR()
classifier.fit(training_data, training_labels)
output = classifier.predict(test_data)
for index, predicted in enumerate(output):
	print('%s %s' % (predicted, actual_labels[index]))

print(metrics.explained_variance_score(actual_labels, output))
    print "----------- Fold %d -----------------------" %i
    print "--------------------------------------------"
    
    val_id = fold_ids.iloc[:, i].dropna()
    idx = train["Id"].isin(list(val_id))
    
    trainingSet = train[~idx]
    validationSet = train[idx]
    
    tr_X = np.matrix(trainingSet[feature_names])
    tr_Y = np.array(trainingSet["Response"])
    val_X = np.matrix(validationSet[feature_names])
    val_Y = np.array(validationSet["Response"])
    
    regm = LinearSVR(C = 0.06, epsilon = 0.45, tol = 1e-5,
                     dual = True, verbose = True, random_state = 133)
                     
    regm.fit(tr_X, tr_Y)    
    preds = regm.predict(val_X)
    
    df = pd.DataFrame(dict({"Id" : validationSet["Id"], "ground_truth" : validationSet["Response"], 
                            "linsvr_preds" : preds}))
    
    linsvr_val = pd.concat([linsvr_val, df], ignore_index=True)
    
    tpreds = regm.predict(test_X)
    cname = "Fold" + `i`
    linsvr_test[cname] = tpreds
    
linsvr_val.to_csv("ensemble2/linsvr_val.csv")
linsvr_test.to_csv("ensemble2/linsvr_test.csv")
Example #42
    X2 = X_train_reduced[test]
    Y2 = Y_train_raw[test]

    ## Train Classifiers on fold
    rdg_clf = Ridge(alpha=0.5)
    rdg_clf.fit(X1, Y1)
    lso_clf = Lasso(alpha=0.6257)
    lso_clf.fit(X1, Y1)
    svr_clf = LinearSVR(C=1e3)
    svr_clf.fit(X1, Y1)

    ## Score Classifiers on fold
    rdg_clf_score = rdg_clf.score(X2, Y2)
    lso_clf_score = lso_clf.score(X2, Y2)
    svr_clf_score = svr_clf.score(X2, Y2)

    print "Ridge:  ", rdg_clf_score
    print "Lasso:  ", lso_clf_score
    print "SVR_RBF:  ", svr_clf_score


## Train final Classifiers
# clf = Ridge(alpha=.5)
clf = LinearSVR(C=1e3)  # LinearSVR has no kernel, so no gamma parameter
clf.fit(X_train_reduced, Y_train_raw)
Y_predicted = clf.predict(X_test_reduced)

## Save results to csv
np.savetxt("prediction.csv", Y_predicted, fmt="%.5f", delimiter=",")
Example #43
svm_reg1 = LinearSVR(epsilon=1.5)  # definition missing from the fragment; inferred from Example #17
svm_reg2 = LinearSVR(epsilon=0.5)
svm_reg1.fit(X, y)
svm_reg2.fit(X, y)



def find_support_vectors(svm_reg, X, y):
	y_pred = svm_reg.predict(X)
	off_margin = (np.abs(y - y_pred) >= svm_reg.epsilon)
	return np.argwhere(off_margin)

svm_reg1.support_ = find_support_vectors(svm_reg1, X, y)
svm_reg2.support_ = find_support_vectors(svm_reg2, X, y)

eps_x1 = 1
eps_y_pred = svm_reg1.predict([[eps_x1]])


def plot_svm_regression(svm_reg, X, y, axes):
	x1s = np.linspace(axes[0], axes[1], 100).reshape(100, 1)
	y_pred = svm_reg.predict(x1s)
	plt.plot(x1s, y_pred, "k-", linewidth=2, label=r"$\hat{y}$")
	plt.plot(x1s, y_pred + svm_reg.epsilon, "k--")
	plt.plot(x1s, y_pred - svm_reg.epsilon, "k--")
	plt.scatter(X[svm_reg.support_], y[svm_reg.support_], s=180, facecolors="#FFAAAA")
	plt.plot(X, y, "bo")
	plt.xlabel(r"$x_1$", fontsize=18)
	plt.legend(loc="upper left", fontsize=18)
	plt.axis(axes)

plt.figure(figsize=(9, 4))
Example #44
    for row in csv.reader(data_file):
        data += [[row[0],row[4],row[6],row[10]]]
        target += [row[9]]

data,target = Lin_clean_data(data[1:],target[1:],2)

point = 2000
X_train = data[:point-1]
X_test = data[point:point+int(point*0.2)]
y_train = target[:point-1]
y_test = target[point:point+int(point*0.2)]


svr = LinearSVR(C=0.1)
svr_model = svr.fit(X_train,y_train)
lin = svr.predict(X_train)
lin_test = svr.predict(X_test)

lin,lin_test = data_normalize(y_train,y_test,lin,lin_test)

print("Train score : ",score(y_train,lin))
print("Train average error : ",sum(abs(y_train-lin)) / float(len(y_train)))

print("Fit score : ",score(y_test,lin_test))
print("Fit average error : ",sum(abs(y_test-lin_test)) / float(len(y_test)))

figure1 = plt.figure(1,figsize=[20,10])
draw_pic(range(len(X_train)),range(len(X_test)),lin,lin_test,y_train,y_test,label='lin',figure=figure1)
figure1.savefig("C:/Users/sean/Desktop/SVR_DATA/linSVR.png",dpi=300,format="png")
plt.close(1)