Code example #1
import numpy as np
from numpy.testing import assert_array_almost_equal, assert_raises
from sklearn.preprocessing import PolynomialFeatures


def test_polynomial_features():
    """Test PolynomialFeatures on 1-D and 2-D inputs."""
    X1 = np.arange(6)[:, np.newaxis]
    P1 = np.hstack([np.ones_like(X1),
                    X1, X1 ** 2, X1 ** 3])
    deg1 = 3

    X2 = np.arange(6).reshape((3, 2))
    x1 = X2[:, :1]
    x2 = X2[:, 1:]
    P2 = np.hstack([x1 ** 0 * x2 ** 0,
                    x1 ** 1 * x2 ** 0,
                    x1 ** 0 * x2 ** 1,
                    x1 ** 2 * x2 ** 0,
                    x1 ** 1 * x2 ** 1,
                    x1 ** 0 * x2 ** 2])
    deg2 = 2

    for (deg, X, P) in [(deg1, X1, P1), (deg2, X2, P2)]:
        P_test = PolynomialFeatures(deg, include_bias=True).fit_transform(X)
        assert_array_almost_equal(P_test, P)

        P_test = PolynomialFeatures(deg, include_bias=False).fit_transform(X)
        assert_array_almost_equal(P_test, P[:, 1:])

    # X is still bound to X2 from the loop above; interaction_only keeps only
    # the bias, x1, x2 and x1*x2 columns (P2 columns 0, 1, 2, 4).
    interact = PolynomialFeatures(2, interaction_only=True, include_bias=True)
    X_poly = interact.fit_transform(X)
    assert_array_almost_equal(X_poly, P2[:, [0, 1, 2, 4]])

    assert_raises(ValueError, interact.transform, X[:, 1:])
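
The column ordering asserted above (bias first, then terms grouped by total degree) can be checked on a tiny input. A minimal sketch using only the public scikit-learn API:

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

X = np.array([[2, 3]])
poly = PolynomialFeatures(degree=2, include_bias=True)
print(poly.fit_transform(X))         # [[1. 2. 3. 4. 6. 9.]] -> 1, x1, x2, x1^2, x1*x2, x2^2
print(poly.get_feature_names_out())  # term names; requires scikit-learn >= 1.0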
Code example #2
File: train.py Project: scnakandala/hummingbird
    def fit(self, data, args):
        self.model = PolynomialFeatures()

        with Timer() as t:
            self.model.fit(data.X_train, data.y_train)

        return t.interval
Code example #3
File: test_data.py Project: zkuncheva/scikit-learn
def test_polynomial_features():
    # Test Polynomial Features
    X1 = np.arange(6)[:, np.newaxis]
    P1 = np.hstack([np.ones_like(X1), X1, X1**2, X1**3])
    deg1 = 3

    X2 = np.arange(6).reshape((3, 2))
    x1 = X2[:, :1]
    x2 = X2[:, 1:]
    P2 = np.hstack([
        x1**0 * x2**0, x1**1 * x2**0, x1**0 * x2**1, x1**2 * x2**0,
        x1**1 * x2**1, x1**0 * x2**2
    ])
    deg2 = 2

    for (deg, X, P) in [(deg1, X1, P1), (deg2, X2, P2)]:
        P_test = PolynomialFeatures(deg, include_bias=True).fit_transform(X)
        assert_array_almost_equal(P_test, P)

        P_test = PolynomialFeatures(deg, include_bias=False).fit_transform(X)
        assert_array_almost_equal(P_test, P[:, 1:])

    interact = PolynomialFeatures(2, interaction_only=True, include_bias=True)
    X_poly = interact.fit_transform(X)
    assert_array_almost_equal(X_poly, P2[:, [0, 1, 2, 4]])
Code example #4
File: main.py Project: bloomen/kaggle-titanic
class PolyFeatureGenerator(TransformerMixin):
    def __init__(self, degree):
        self._poly = PolynomialFeatures(degree=degree)

    def transform(self, df, *_):
        df_poly = self._poly.transform(df)
        df_poly = pd.DataFrame(df_poly, columns=self._poly.get_feature_names())
        df_poly.index = df.index
        df = pd.concat([df, df_poly], axis=1)
        return df

    def fit(self, df, *_):
        self._poly.fit(df)
        return self
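
A hedged usage sketch for the class above (the toy DataFrame is invented for illustration). Note that scikit-learn 1.2 removed get_feature_names(), so on recent versions the call in transform needs to become get_feature_names_out():

import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.0], 'b': [3.0, 4.0]})
gen = PolyFeatureGenerator(degree=2)
expanded = gen.fit(df).transform(df)  # original columns plus '1', 'x0', 'x1', 'x0^2', ...
print(expanded.columns.tolist())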
Code example #5
 def fit(self, X, y=None):
     self._sklearn_model = SKLModel(**self._hyperparams)
     if (y is not None):
         self._sklearn_model.fit(X, y)
     else:
         self._sklearn_model.fit(X)
     return self
Code example #6
    def polynomial(self):
        poly = PolynomialFeatures(degree=3)
        self.training_order_start_end_districts_and_time = poly.fit_transform(
            self.training_order_start_end_districts_and_time,
            self.training_number_of_orders)
        predict = poly.transform(
            self.testing_order_start_end_districts_and_time)

        clf = linear_model.LinearRegression()
        clf.fit(self.training_order_start_end_districts_and_time,
                self.training_number_of_orders)
        predicted_number_of_orders = clf.predict(predict)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders)**2)
        print(current_ride_prediction_error)
        print(clf.coef_)
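
A degree-3 expansion like the one above grows the feature count combinatorially: C(n + 3, 3) output columns for n input features, bias included. A minimal sketch with an invented 10-feature input:

from math import comb
import numpy
from sklearn.preprocessing import PolynomialFeatures

X = numpy.random.rand(5, 10)    # 10 input features
poly = PolynomialFeatures(degree=3).fit(X)
print(poly.n_output_features_)  # 286
print(comb(10 + 3, 3))          # 286, the same count in closed form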
Code example #7
    def __gen_model(self, model = LinearRegression()):
        model = Pipeline([('poly', PolynomialFeatures(degree=3)),
            ('linear', LinearRegression(fit_intercept=False))])
        X_train, y_train, _ = self.getDataSet()
        model.fit(X_train, y_train)
#         print("model coef:", model.named_steps['linear'].coef_)
        self.model = model
Code example #8
class PolynomialFeaturesImpl():
    def __init__(self, degree=2, interaction_only=False, include_bias=True):
        self._hyperparams = {
            'degree': degree,
            'interaction_only': interaction_only,
            'include_bias': include_bias
        }

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def transform(self, X):
        return self._sklearn_model.transform(X)
Code example #9
File: train.py Project: scnakandala/hummingbird
class CreatePolynomialFeatures(CreateModel):
    def fit(self, data, args):
        self.model = PolynomialFeatures()

        with Timer() as t:
            self.model.fit(data.X_train, data.y_train)

        return t.interval

    def test(self, data):
        assert self.model is not None

        return self.model.transform(data.X_test)

    def predict(self, data):
        with Timer() as t:
            self.predictions = self.test(data)

        data.learning_task = LearningTask.REGRESSION
        return t.interval
Code example #10
File: test_trainer.py Project: stbalduin/memobuilder
def test_ols_with_boston_dataset():

    # load boston dataset
    dataset = load_boston()

    # create metamodel
    input_names = list(dataset.inputs)
    response_names = list(dataset.responses)
    metamodel = metamodels.OLSModel(
        preprocessors=[PolynomialFeatures(degree=2)],
        input_names=input_names,
        response_names=response_names)

    # create trainer and fit metamodel to the dataset
    result = Trainer().fit(metamodel, dataset)

    print('score:', result.score)

    # score: 0.682539990982
    assert result.score > 0.68
    assert result.score < 0.69
Code example #11
boston = load_boston()
#print(boston)
# The DESCR attribute gives a detailed description of the dataset: 14 columns, where the first 13 are features and the last is the label.
#print(boston.DESCR)
# boston.data and boston.target hold the features and the labels, respectively.
#print(boston.data)
#print(boston.target)
# Split the dataset.
X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                    boston.target,
                                                    test_size=0.2,
                                                    random_state=2)

# Adding polynomial features helps the linear regression model fit the data better.
# More polynomial terms keep improving the fit on the training set, but easily cause overfitting.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)  # transform only: the expansion is fit on the training set alone
# Polynomial linear regression.
model2 = LinearRegression(normalize=True)  # note: the normalize parameter was removed in scikit-learn 1.2
model2.fit(X_train_poly, y_train)
mutilScore = model2.score(X_test_poly, y_test)
print(mutilScore)

# Test the model and evaluate the results with the mean squared error (MSE).
# Fitted-model predictions:
y_pred = model2.predict(X_test_poly)
print("MSE:", metrics.mean_squared_error(y_test, y_pred))

# Cross-validation.
predicted = cross_val_predict(model2, boston.data, boston.target, cv=10)
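
The snippet ends with cross_val_predict but never scores the out-of-fold predictions. A hedged completion (r2_score is one reasonable choice, not necessarily what the original author intended):

from sklearn.metrics import r2_score
print("CV R^2:", r2_score(boston.target, predicted))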
Code example #12
vectorizer = TfidfVectorizer(analyzer="word",
                             vocabulary=wordslist.keys(),
                             ngram_range=(1, 4),
                             stop_words=stopwordlist)
X_main = vectorizer.fit_transform(corpus)
print("Main Words Shape", X_main.shape)

vectPoly = TfidfVectorizer(analyzer="word",
                           vocabulary=polywords,
                           ngram_range=(1, 4),
                           use_idf=True,
                           stop_words=stopwordlist)
poly = vectPoly.fit_transform(corpus)
print("Poly Words Shape", poly.shape)

polyFeatures = PolynomialFeatures(degree=2,
                                  interaction_only=True,
                                  include_bias=False)
X_poly = polyFeatures.fit_transform(poly.toarray())  # toarray(): newer scikit-learn rejects the np.matrix that .todense() returns
print("Poly Words into Poly Features Shape", X_poly.shape)

X = np.concatenate((X_main.toarray(), X_poly), axis=1)
#X=np.concatenate((X_main.toarray(),power(poly.toarray(),3)),axis=1)
print("Matrix Shape", X.shape)
'''
outfile = open('OUTPUT_6_articles_TFIDF.txt', 'w')
for item in X:
	temp = str(item)
	outfile.write(temp)

outfile.close()
'''
Code example #13
with open(filename, "r") as filestream:
    for line in filestream:
        current = line.split(",")
        Z.append(current)

#Shuffle Matrix for Cross Validation
Z = np.asarray(Z)
np.random.shuffle(Z)

#Split Matrix in X,Y
X = []
Y = []
X, B, Y = np.hsplit(Z, [Z.shape[1] - 1, Z.shape[1] - 1])  # B is empty; X = all but the last column, Y = the last column

#Non-Linear (note: degree=1 only prepends a bias column; use degree >= 2 for non-linear terms)
X = PolynomialFeatures(1).fit_transform(X)

#Get Float Data
X = X.astype(float)  # np.float was removed in NumPy 1.24
classes = np.unique(Y)
for i in range(0, len(classes)):
    classes[i] = classes[i].strip()
classes = np.unique(classes)

y = []
for i in range(0, len(Y)):
    for j in range(0, len(classes)):
        if (Y[i].item(0).strip() == classes[j]):
            y.append(j)
Y = np.asarray(y)
Y = Y.astype(float)
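
The class-mapping loops above can be collapsed into one np.unique call, which sorts the stripped labels and emits integer codes in a single pass. A sketch, where raw_labels is a hypothetical name for the string label column (what Y held before the loops overwrote it):

# raw_labels: hypothetical; the string label column before encoding
stripped = np.char.strip(raw_labels.ravel().astype(str))
classes, codes = np.unique(stripped, return_inverse=True)
Y_alt = codes.astype(float)  # same mapping: index into the sorted unique classes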
Code example #14
    data = np.loadtxt(dataFile,delimiter=',',skiprows=1,usecols=(16,18,21,24,8,11,12,9,10))
    CoHOMO = np.loadtxt(dataFile,delimiter=',',skiprows=2,usecols=(68,70))
    therm = np.loadtxt(dataFile,delimiter=',',skiprows=2,usecols=(8,9,10))

    # Stack the single predictor as a column; the trailing comma matters:
    # without it, column_stack receives a 1-D array and builds a single row.
    predictors = np.column_stack((CoHOMO[:,1],))

#    predictors = np.column_stack((data[:,0],data[:,2],data[:,3],data[:,4])) #LowdwinH2, Buried, VBuried, pka, r2=0.70951(hyd), r2(h2)=0.2226

#    predictors = np.column_stack((data[:,-4],data[:,-3],data[:,4])) #tau,tau,pka r2=0.7004262

#    hydricities = data[:,-1] #actually for H2binding
    hydricities = therm[:,1]

    # compound features
    polyFeatures = PolynomialFeatures(degree=1,interaction_only=True)  # degree=1: this only prepends a bias column
    regressor = make_pipeline(polyFeatures, LinearRegression())
#    regressor = LinearRegression()

    regressor.fit(predictors, hydricities)
    predictions = regressor.predict(predictors)
    print('R^2: ', regressor.score(predictors, hydricities))
    scatterPlot.plotScatterPlot(hydricities, predictions, (Path.home() / 'Desktop' / 'ianPredictions'))
#    print(regressor.coef_,regressor.intercept_)
    pass
Code example #15
def validate(params):
    transf_type = params['transf_type']

    if transf_type == 'drop':
        transf = FunctionTransformer(drop_transform, validate=False)
    elif transf_type == 'dr+inp+sc+pca':
        transf = make_pipeline(
            drop_transform,
            SimpleImputer(),
            StandardScaler(),
            PCA(n_components=params['n_pca_components']),
        )
    elif transf_type == 'dr+inp':
        transf = make_pipeline(
            drop_transform,
            SimpleImputer(),
        )
    elif transf_type == 'dr+inp+sc':
        transf = make_pipeline(drop_transform, SimpleImputer(),
                               StandardScaler())
    elif transf_type == 'union':
        transf = create_union_transf(params)
    elif transf_type == 'poly_kbest':
        transf = make_pipeline(
            drop_transform,
            SimpleImputer(),
            StandardScaler(),
            PolynomialFeatures(degree=2, interaction_only=True),
            SelectKBest(f_regression, k=params['best_features']),
        )
    else:
        raise AttributeError(f'unknown transformer type: {transf_type}')

    est_type = params['est_type']

    if est_type == 'xgboost':
        est = create_xgb_est(params)
    elif est_type == 'gblinear':
        est = create_gblinear_est(params)
    elif est_type == 'exttree':
        est = ExtraTreesRegressor(n_estimators=params['n_estimators'],
                                  n_jobs=-1)
    elif est_type == 'gp':
        est = GaussianProcessRegressor()
    elif est_type == 'ridge':
        est = Ridge(alpha=params['alpha'])
    else:
        raise AttributeError(f'unknown estimator type: {est_type}')

    if params['bagging']:
        # Wrap the estimator in a bagging ensemble.
        est = BaggingRegressor(est,
                               n_estimators=params['n_bag_estimators'],
                               max_features=1.,
                               max_samples=1.)

    pl = make_pipeline(transf, est)

    if params['per_group_regr']:
        pl = PerGroupRegressor(estimator=pl,
                               split_condition=['os', 'cpuFreq', 'memSize_MB'],
                               n_jobs=1,
                               verbose=1)

    return cv_test(pl, n_folds=params['n_folds'])
Code example #16
    data.loc[data['Weekday'] == i, 'expensive than average weekday'] = data.loc[data['Weekday'] == i, 'Price'] - \
                                                                      data.loc[data['Weekday'] == i, 'Price'].mean()
for i in range(1, 366):
    data.loc[data['Date'] == i, 'expensive than average date'] = data.loc[data['Date'] == i, 'Price'] - \
                                                                      data.loc[data['Date'] == i, 'Price'].mean()
for i in range(2):
    data.loc[data['Apartment'] == i, 'expensive than average apartment'] = data.loc[data['Apartment'] == i, 'Price'] - \
                                                                      data.loc[data['Apartment'] == i, 'Price'].mean()
for i in range(1, 5):
    data.loc[data['Beds'] == i, 'expensive than average bed'] = data.loc[data['Beds'] == i, 'Price'] - \
                                                                data.loc[data['Beds'] == i, 'Price'].mean()
threshold1 = Binarizer(threshold=3.0)
res1 = pd.DataFrame(threshold1.transform(data['Review'].values.reshape(-1, 1)))
threshold2 = Binarizer(threshold=80)
res2 = pd.DataFrame(threshold2.transform(data['Price'].values.reshape(-1, 1)))
pf = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

res3 = pd.DataFrame(
    pf.fit_transform(
        data[['Apartment', 'Beds', 'Review', 'Pic Quality', 'Price']]))

encoder = OneHotEncoder()
data_region1hot = encoder.fit_transform(data['Region'].values.reshape(-1, 1))
data_region = pd.DataFrame(data_region1hot.toarray())
data_weekday1hot = encoder.fit_transform(data['Weekday'].values.reshape(-1, 1))
data_weekday = pd.DataFrame(data_weekday1hot.toarray())
data_reformed = pd.concat(
    [data.drop(columns=['ID']), data_region, data_weekday, res1, res2, res3],
    axis=1)

Seed = 40
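
The interaction columns in res3 come back unnamed, which makes the concatenated frame hard to audit. scikit-learn >= 1.0 exposes readable names via get_feature_names_out (older releases used get_feature_names); a hedged sketch:

cols = ['Apartment', 'Beds', 'Review', 'Pic Quality', 'Price']
res3 = pd.DataFrame(pf.fit_transform(data[cols]),
                    columns=pf.get_feature_names_out(cols))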
Code example #17
#selecting based on best performance
#    predictors = np.column_stack((NBO[:,0],sa[:,0],sa[:,1],bv[:,0],CoHOMO[:,0],CoHOMO[:,1],CoHOMO[:,2],CoLUMO[:,0],CoLUMO[:,1],ba[:,0],ba[:,1],ba[:,2],ba[:,3],ba[:,4],ba[:,5],lt[:,0],lt[:,1],lt[:,2],lt[:,4],lt[:,5]))



#######Training targets  ###
#    hydricities = CoHOMO[:,1]
#    hyduns = np.column_stack((therm[:,1])).reshape((-1,1))
#    scaler = StandardScaler()
#    hydricities2 = hydricities.reshape((-1,1))
#    hydricities=scale(hydricities2)
#    print(hyd1)

    # compound features
    polyFeatures = PolynomialFeatures(degree=1,interaction_only=False)  # degree=1 only prepends a bias column
    regressor = make_pipeline(polyFeatures, StandardScaler(), LassoCV(max_iter=60000, cv=KFold(n_splits=5, shuffle=True)))
#    regressor = make_pipeline(polyFeatures, LassoCV(max_iter=60000, cv=KFold(n_splits=5, shuffle=True)))
#    regressor = make_pipeline(polyFeatures, StandardScaler(), LassoCV(max_iter=60000))
#    regressor = make_pipeline(polyFeatures, StandardScaler(), LassoCV(max_iter=60000))
#    regressor = make_pipeline(polyFeatures, StandardScaler(), Ridge())
#    regressor = make_pipeline(polyFeatures, StandardScaler(), Lasso(alpha=0.035, max_iter=70000))#, fit_intercept=True))
#    regressor = make_pipeline(polyFeatures, StandardScaler(), Lasso(alpha=0, max_iter=70000))#, fit_intercept=True))
#    regressor = make_pipeline(polyFeatures, StandardScaler(), LinearRegression())
#    regressor = make_pipeline(polyFeatures, LinearRegression())
#    regressor = RandomForestRegressor(oob_score=True,n_estimators=2000)
Code example #18
File: main.py Project: bloomen/kaggle-titanic
 def __init__(self, degree):
     self._poly = PolynomialFeatures(degree=degree)
Code example #19
File: polynomial_features.py Project: sreev/lale
 def __init__(self, degree=2, interaction_only=False, include_bias=True):
     self._hyperparams = {
         'degree': degree,
         'interaction_only': interaction_only,
         'include_bias': include_bias}
     self._wrapped_model = SKLModel(**self._hyperparams)
Code example #20
File: Model.py Project: sjl421/ML-Pipeline
def create_model(
    model_type,
    feature_scaling=False,
    polynomial_degree=1,
    cross_validation=False,
    alpha=1.0,
    C=None,
    kernel=None,
    svr_epsilon=None,
    svr_degree=None,
    svr_gamma=None,
    svr_coef0=None,
    sparse=False,
):
    """ Creates a new model of the specified type.

    Args:
        model_type (str): The type of model to create. Use one of the MODEL_TYPE_X constants.
        feature_scaling (bool): If feature scaling is to be used.
        polynomial_degree (int): If higher than 1, polynomial feature transformation will be applied.
        cross_validation (bool): If cross validation is to be applied, if applicable to the model type.
        alpha (float): The regularization parameter. Will only be used if applicable to the model type.
        C (float): The regularization parameter for SVR. Will only be used if applicable to the model type.
        kernel (str): The kernel to use, if applicable to the model type.
        sparse (bool): If a sparse feature matrix is used.
        svr_epsilon (float): Epsilon parameter for SVR. Specifies the epsilon tube. (see sklearn for more info)
        svr_degree (int): Polynomial degree parameter for the SVR kernel 'poly'
        svr_gamma (float): Kernel coefficient for SVR kernels 'rbf', 'poly' and 'sigmoid'
        svr_coef0 (float): Independent term (or bias) for SVR kernels 'poly' and 'sigmoid'

    Returns:
        (sklearn.pipeline.Pipeline) The estimator model.
    """
    assert polynomial_degree > 0, "Polynomial degree must be higher than 0!"
    model_type = model_type.upper()
    logging.debug("Creating model with type %s" % model_type)
    if model_type == MODEL_TYPE_LINREG:
        model = create_linear_regression_model()
    elif model_type == MODEL_TYPE_RIDREG:
        if cross_validation:
            model = create_ridge_cv_model(alpha)
        else:
            model = create_ridge_model(alpha)
    elif model_type == MODEL_TYPE_SVR:
        if cross_validation:
            model = create_svr_cv_model(C, kernel, svr_epsilon, svr_degree,
                                        svr_gamma, svr_coef0)
        else:
            model = create_svr_model(C, kernel, svr_epsilon, svr_degree,
                                     svr_gamma, svr_coef0)
    else:
        raise ValueError("The model type %s is not supported." % model_type)

    steps = []
    if polynomial_degree > 1:
        if not sparse:
            steps.append(
                ("poly", PolynomialFeatures(degree=polynomial_degree)))
        else:
            logging.warning(
                "Polynomial Features for sparse matrices are not supported!")
    if feature_scaling:
        if sparse:
            scaler = SparseScaler()
        else:
            scaler = StandardScaler()
        steps.append(("scale", scaler))
    steps.append((model_type, model))

    return Pipeline(steps)
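
A hedged usage sketch for create_model. MODEL_TYPE_RIDREG stands for one of the MODEL_TYPE_X constants the docstring references (defined elsewhere in the project), and the data names are placeholders:

model = create_model(MODEL_TYPE_RIDREG,
                     feature_scaling=True,
                     polynomial_degree=2,
                     cross_validation=True,
                     alpha=0.5)
model.fit(X_train, y_train)          # any dense regression data
predictions = model.predict(X_test)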