Example #1
def basicPredictor(vecnumber):
    procvec, diagvec, diagdict = getFeatures.getFeatures()
    procvec1 = procvec["Medicare"]
    diagvec1 = diagvec["Medicare"]
    procvec2 = procvec["Medicaid"]
    diagvec2 = diagvec["Medicaid"]
    procvec3 = procvec["Uninsured"]
    diagvec3 = diagvec["Uninsured"]
    keylist = sorted(procvec1.keys())
    veclist = []
    diaglist = []
    statenamelist = []
    i = vecnumber
    for key in keylist:
        vecs = procvec1[key]
        countvec = vecs[i]
        veclist.append(countvec)
        diaglist.append(diagvec1[key])

        try:
            veclist.append(procvec2[key][i])
            diaglist.append(diagvec2[key])
        except KeyError:  # state absent from the Medicaid data
            pass
        try:
            veclist.append(procvec3[key][i])
            diaglist.append(diagvec3[key])
        except KeyError:  # state absent from the Uninsured data
            pass

        statenamelist.append(key)
    veclist = np.asarray(veclist)
    diaglist = np.asarray(diaglist)
    '''
    #encodes only top 5 diagnoses for each input vector, does not give good accuracy!!
    diagindex=np.argpartition(diaglist,-5,axis=1)[:,-5:]
    hotlist=[]
    for i in range(len(diaglist)):
        hot=np.zeros((len(diaglist[0]),),dtype=float)
        hot[diagindex[i]]=1.0
        hotlist.append(hot)
    hotlist=np.asarray(hotlist)
    hotlist=preprocessing.normalize(hotlist)
    
    '''
    #print diagindex
    hotlist = preprocessing.normalize(diaglist)
    standardscaler = preprocessing.StandardScaler()
    vecs = standardscaler.fit_transform(veclist)

    #  for testing how size of training data affects accuracy
    sizevec = [0.11, 0.22, 0.33, 0.44, 0.55, 0.66]
    error = []
    for i in sizevec:
        xtrain, xtest, ytrain, ytest = train_test_split(vecs,
                                                        hotlist,
                                                        test_size=i,
                                                        random_state=17)
        knn = neighbors.KNeighborsRegressor()
        knn.fit(xtrain, ytrain)
        score = knn.score(xtest, ytest)  # R^2 on the held-out split
        error.append(score)
    print(error)
Example #2
hyper_params = [{
    'n_neighbors': (
        100,
        1000,
    ),
    'weights': (
        'uniform',
        'distance',
    ),
    'p': (
        1,
        2,
    ),
}]

# Choose the regressor est (=estimator)
# This should be changed to try different regressors.
est = neighbors.KNeighborsRegressor()

# Use GridSearch to find the best combination of hyper-parameters
gs = GridSearchCV(est,
                  cv=10,
                  param_grid=hyper_params,
                  verbose=2,
                  n_jobs=n_jobs,
                  scoring='r2')

# Train the MLA and take the time
t0 = time.time()
gs.fit(x_train, y_train.ravel())
runtime = time.time() - t0
print("kNN complexity and bandwidth selected and model fitted in %.6f s" %
      runtime)
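After the fit, the gs object exposes the winning configuration. A minimal follow-up sketch (the attributes are standard scikit-learn; best_knn is just an illustrative name):

print("Best CV R^2: %.4f" % gs.best_score_)
print("Best hyper-parameters: %s" % gs.best_params_)
best_knn = gs.best_estimator_  # already refitted on the full training data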
Example #3
def SVR_train(*data):
    X, Y = data
    #### 3.1 Decision tree regression ####
    from sklearn import tree
    model_DecisionTreeRegressor = tree.DecisionTreeRegressor()
    #### 3.2 Linear regression ####
    from sklearn import linear_model
    model_LinearRegression = linear_model.LinearRegression()
    #### 3.3 SVM regression ####
    from sklearn import svm
    model_SVR = svm.SVR()
    model_SVR2 = svm.SVR(kernel='rbf', C=100, gamma=0.1)
    #### 3.4 KNN regression ####
    from sklearn import neighbors
    model_KNeighborsRegressor = neighbors.KNeighborsRegressor()
    #### 3.5 Random forest regression ####
    from sklearn import ensemble
    model_RandomForestRegressor = ensemble.RandomForestRegressor(
        n_estimators=20)  # 20 decision trees
    #### 3.6 AdaBoost regression ####
    from sklearn import ensemble
    model_AdaBoostRegressor = ensemble.AdaBoostRegressor(
        n_estimators=50)  # 50 decision trees
    #### 3.7 GBRT regression ####
    from sklearn import ensemble
    model_GradientBoostingRegressor = ensemble.GradientBoostingRegressor(
        n_estimators=100)  # 100 decision trees
    #### 3.8 Bagging regression ####
    from sklearn.ensemble import BaggingRegressor
    model_BaggingRegressor = BaggingRegressor()
    #### 3.9 ExtraTree (extremely randomized tree) regression ####
    from sklearn.tree import ExtraTreeRegressor
    model_ExtraTreeRegressor = ExtraTreeRegressor()

    # Create the (parametrised) models
    # print("Hit Rates/Confusion Matrices:\n")
    models = [
        ("model_DecisionTreeRegressor", model_DecisionTreeRegressor),
        ("model_LinearRegression", model_LinearRegression),
        (
            "model_SVR",
            model_SVR2  #model_SVR
        ),
        ("model_KNeighborsRegressor", model_KNeighborsRegressor),
        ("model_RandomForestRegressor", model_RandomForestRegressor),
        ("model_AdaBoostRegressor", model_AdaBoostRegressor),
        ("model_GradientBoostingRegressor", model_GradientBoostingRegressor),
        ("model_BaggingRegressor", model_BaggingRegressor),
        ("model_ExtraTreeRegressor", model_ExtraTreeRegressor)
    ]

    for m in models:

        #X = X.reset_index(drop=True)
        #print(X)
        # y = y.reset_index(drop=True)
        # print(y)

        from sklearn.model_selection import KFold
        kf = KFold(n_splits=2, shuffle=False)

        for train_index, test_index in kf.split(X):
            # print(train_index, test_index)
            # print(X.loc[[0,1,2]])

            X_train, X_test, y_train, y_test = \
                X[train_index], X[test_index], Y[train_index], Y[test_index]
            # X_train/y_train: training set for this fold; X_test/y_test: validation set
            #print(X_test, y_test)
            #print(X_train, y_train)
            print('======================================')

            import datetime
            starttime = datetime.datetime.now()

            print("Training model %s:" % m[0])
            m[1].fit(X_train, y_train)

            # Make an array of predictions on the test set
            pred = m[1].predict(X_test)

            # Output the hit-rate and the confusion matrix for each model
            score = m[1].score(X_test, y_test)
            print("%s:\n%0.3f" % (m[0], m[1].score(X_test, y_test)))
            # print("%s\n" % confusion_matrix(y_test, pred, labels=[-1.0, 1.0]))#labels=["ant", "bird", "cat"]

            from sklearn.metrics import r2_score
            r2 = r2_score(y_test, pred)
            print('r2: ', r2)

            endtime = datetime.datetime.now()
            print('%s train+predict time (seconds):' % m[0], (endtime - starttime).seconds)

            #result = m[1].predict(X_test)
            import matplotlib.pyplot as plt
            plt.figure()
            plt.plot(np.arange(len(pred)), y_test, 'go-', label='true value')
            plt.plot(np.arange(len(pred)), pred, 'ro-', label='predict value')
            plt.title('score: %f' % score)
            plt.legend()
            plt.show()
Example #4
from sklearn.model_selection import train_test_split
from sklearn import neighbors
from sklearn.datasets import make_regression
from matplotlib import pyplot as plt
import numpy as np

# ----------------- Generate Synthetic Data ---------------#
X_R, y_R = make_regression(n_samples=100,
                           n_features=1,
                           n_informative=1,
                           bias=150.0,
                           noise=30)
fig, subaxes = plt.subplots(5, 1, figsize=(11, 8), dpi=100)
X = np.linspace(-3, 3, 500).reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(X_R, y_R, random_state=0)
# --------------------------- KNN -------------------------#
for ax, K in zip(subaxes, [1, 3, 7, 15, 59]):
    knn_reg = neighbors.KNeighborsRegressor(n_neighbors=K)
    knn_reg.fit(X_train, y_train)
    y_predict_output = knn_reg.predict(X)
    ax.plot(X, y_predict_output)
    ax.plot(X_train, y_train, 'o', alpha=0.9, label='Train')
    ax.plot(X_test, y_test, '^', alpha=0.9, label='Test')
    ax.set_xlabel('Input feature')
    ax.set_ylabel('Target value')
    ax.set_title('KNN Regression (K={})'.format(K))
    ax.legend()

plt.show()
Example #5
# Compute target and add noise
y = np.sinc(X).ravel()
y += 0.2 * (0.5 - np.random.rand(y.size))

# Plot input data
plt.figure()
plt.scatter(X, y, s=40, c='k', facecolors='none')
plt.title('Input data')

# Create the 1D grid with 10 times the density of the input data
x_values = np.linspace(-0.5 * amplitude, 0.5 * amplitude,
                       10 * num_points)[:, np.newaxis]

# Number of neighbors to consider
n_neighbors = 8

# Define and train the regressor
knn_regressor = neighbors.KNeighborsRegressor(n_neighbors, weights='distance')
y_values = knn_regressor.fit(X, y).predict(x_values)

plt.figure()
plt.scatter(X, y, s=40, c='k', facecolors='none', label='input data')
plt.plot(x_values, y_values, c='k', linestyle='--', label='predicted values')
plt.xlim(X.min() - 1, X.max() + 1)
plt.ylim(y.min() - 0.2, y.max() + 0.2)
plt.axis('tight')
plt.legend()
plt.title('K Nearest Neighbors Regressor')

plt.show()
Example #6
def knn(data, startAt, stopAt=None):
    """
    Predicts the points between startAt and stopAt with the k-nearest
    neighbors method. Automatically finds the best number of neighbors.
    If stopAt is not provided, the default value is the length of data.

    Parameters:
        data (pandas.DataFrame): Data returned by prepare_data (may be
                                 differentiated)
        startAt (int): Index where the forecast starts
        stopAt (int): Index where the forecast stops
            (default is None)

    Returns:
        predictions (list): The forecast from startAt up to stopAt
    """

    data_copy = data.copy()

    if (stopAt is None):
        stopAt = len(data_copy)

    periods = stopAt - startAt

    from fastai.tabular import add_datepart
    add_datepart(data_copy, 'Date')
    data_copy.drop('Elapsed', axis=1, inplace=True)

    # setting importance of days before and after weekends
    # we assume that fridays and mondays are more important
    # 0 is Monday, 1 is Tuesday...
    data_copy['mon_fri'] = data_copy['Dayofweek'].isin([0, 4]).astype(int)

    train = data_copy[:startAt]
    valid = data_copy[startAt:stopAt]

    from sklearn import neighbors
    from sklearn.model_selection import GridSearchCV
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler(feature_range=(0, 1))

    x_train_scaled = scaler.fit_transform(train.drop('Close', axis=1))
    x_train = pd.DataFrame(x_train_scaled)
    y_train = train['Close']

    x_valid_scaled = scaler.transform(valid.drop('Close', axis=1))  # reuse the scaler fitted on train
    x_valid = pd.DataFrame(x_valid_scaled)
    y_valid = valid['Close']

    params = {'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9]}
    knn = neighbors.KNeighborsRegressor()
    model = GridSearchCV(knn, params, cv=5)  # note: the iid argument was removed in scikit-learn 0.24

    model.fit(x_train, y_train)
    predictions = model.predict(x_valid)

    return predictions
Example #7
features = ['GrLivArea']
#features = ['TotalBsmtSF']
viz_cont_cont(house_train, features, target)

features_to_filter = ['Id']
filter_features(house_train, features_to_filter)

#do one-hot-encoding for all the categorical features
house_train1 = one_hot_encode(house_train)
house_train1.shape
house_train1.info()

#filter_features(house_train1, ['SalePrice','log_sale_price'])
filter_features(house_train1, ['SalePrice'])
X_train = house_train1
y_train = house_train['SalePrice']

X_train.shape
#Step 1 model
rf_estimator = ensemble.RandomForestRegressor(n_estimators=100)  # regressor: SalePrice is continuous
feature_imp_df, X_train1 = feature_selection_from_model(
    rf_estimator, X_train, y_train)
X_train1.shape

scaled_model = get_scale_model(X_train1)
X_train1 = scaled_model.transform(X_train1)
knn_estimator = neighbors.KNeighborsRegressor()
knn_grid = {'n_neighbors': [15, 20]}
model = fit_model(knn_estimator, knn_grid, X_train1, y_train)
#Best score: 0.182414942974
#.score: 0.17392101913392907
Example #8
df["enc_state"] = state_encoder.fit_transform(df["State"])
df["enc_state"]
################
df.head()

df.drop("State", axis=1, inplace=True)
df.info()

X = df.drop("Profit", axis=1)
y = df["Profit"]

Xtrain, Xtest, ytrain, ytest = model_selection.train_test_split(
    X, y, test_size=0.15, random_state=42)
Xtrain.info()

knnmodel = neighbors.KNeighborsRegressor(n_neighbors=11)
knnmodel.fit(Xtrain, ytrain)
#fit() does not build a parametric model; it builds a data structure that
#speeds up the neighbor search:
#  kdtree
#  balltree
#  brute
#selected via algorithm="........."
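# Sketch (not from the original script): the search structure can also be
# chosen explicitly; all three return the same neighbors and differ mainly
# in fit/query speed.
for algo in ("kd_tree", "ball_tree", "brute"):
    alt_knn = neighbors.KNeighborsRegressor(n_neighbors=11, algorithm=algo)
    alt_knn.fit(Xtrain, ytrain)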

prediction = knnmodel.predict(Xtest)
print(np.sqrt(metrics.mean_squared_error(ytest, prediction)))

X[:3]

#standard scaling
avg = df.rs.mean()
sd = df.rs.std()
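The fragment above starts a manual z-score. A one-line sketch of how it would be applied (assuming rs is the column being standardized):

df["rs_scaled"] = (df.rs - avg) / sd  # classic z-score using the stats above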
Example #9
#     result = clf.predict(x_test)
#     score = mse(y_test,result)
#     plt.figure()
#     plt.plot(np.arange(len(result)), y_test,'go-',label='true value')
#     plt.plot(np.arange(len(result)),result,'ro-',label='predict value')
#     plt.title('score: %f'%score)
#     plt.legend()
#     plt.show()

from sklearn import neighbors
from sklearn import svm
from sklearn import ensemble

# rf = ensemble.RandomForestRegressor(n_estimators=20)  # 20 decision trees
# svr = svm.SVR()
knn = neighbors.KNeighborsRegressor(n_neighbors=4)
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state=42)

results = []
sub_array = []
train = train.values
y_train = train_Y.values
test1 = test1.values

# cross-validated fit/predict for each model
for model in [knn]:
    for traincv, testcv in cv.split(train, y_train):
        model.fit(train[traincv], y_train[traincv])
        y_tmp = model.predict(train[testcv])
Example #10
def k_nearest_neighbors(other_args: List[str], s_ticker: str,
                        df_stock: pd.DataFrame):
    """
    Train KNN model
    Parameters
    ----------
    other_args: List[str]
        List of argparse arguments
    s_ticker: str
        Ticker
    df_stock: pd.DataFrame
        Dataframe of stock prices

    Returns
    -------

    """
    parser = argparse.ArgumentParser(
        add_help=False,
        prog="knn",
        description="""
            K nearest neighbors is a simple algorithm that stores all
            available cases and predict the numerical target based on a similarity measure
            (e.g. distance functions).
        """,
    )

    parser.add_argument(
        "-i",
        "--input",
        action="store",
        dest="n_inputs",
        type=check_positive,
        default=40,
        help="number of days to use as input for prediction.",
    )
    parser.add_argument(
        "-d",
        "--days",
        action="store",
        dest="n_days",
        type=check_positive,
        default=5,
        help="prediction days.",
    )
    parser.add_argument(
        "-j",
        "--jumps",
        action="store",
        dest="n_jumps",
        type=check_positive,
        default=1,
        help="number of jumps in training data.",
    )
    parser.add_argument(
        "-n",
        "--neighbors",
        action="store",
        dest="n_neighbors",
        type=check_positive,
        default=20,
        help="number of neighbors to use on the algorithm.",
    )
    parser.add_argument(
        "-e",
        "--end",
        action="store",
        type=valid_date,
        dest="s_end_date",
        default=None,
        help="The end date (format YYYY-MM-DD) to select for testing",
    )

    parser.add_argument(
        "-t",
        "--test_size",
        default=0.2,
        dest="valid_split",
        type=float,
        help="Percentage of data to validate in sample",
    )
    parser.add_argument(
        "-p",
        "--pp",
        action="store",
        dest="s_preprocessing",
        default="none",
        choices=["normalization", "standardization", "minmax", "none"],
        help="pre-processing data.",
    )

    try:
        ns_parser = parse_known_args_and_warn(parser, other_args)
        if not ns_parser:
            return

        (
            X_train,
            X_valid,
            y_train,
            y_valid,
            _,
            _,
            _,
            y_dates_valid,
            forecast_data_input,
            dates_forecast_input,
            scaler,
            is_error,
        ) = prepare_scale_train_valid_test(df_stock["5. adjusted close"],
                                           ns_parser)
        if is_error:
            print("Error preparing data")
            return
        print(
            f"Training on {X_train.shape[0]} sequences of length {X_train.shape[1]}. "
            f"Using {X_valid.shape[0]} sequences of length {X_valid.shape[1]} for validation")
        future_dates = get_next_stock_market_days(dates_forecast_input[-1],
                                                  n_next_days=ns_parser.n_days)

        # Machine Learning model
        knn = neighbors.KNeighborsRegressor(n_neighbors=ns_parser.n_neighbors)
        knn.fit(
            X_train.reshape(X_train.shape[0], X_train.shape[1]),
            y_train.reshape(y_train.shape[0], y_train.shape[1]),
        )

        preds = knn.predict(X_valid.reshape(X_valid.shape[0],
                                            X_valid.shape[1]))
        forecast_data = knn.predict(forecast_data_input.reshape(1, -1))

        forecast_data_df = pd.DataFrame(
            [i if i > 0 else 0 for i in forecast_data.T], index=future_dates)
        print_pretty_prediction(forecast_data_df[0],
                                df_stock["5. adjusted close"].values[-1])
        plot_data_predictions(
            df_stock,
            preds,
            y_valid,
            y_dates_valid,
            scaler,
            f"KNN Model with {ns_parser.n_neighbors} Neighbors on {s_ticker}",
            forecast_data_df,
            1,
        )

    except Exception as e:
        print(e)
        print("")
Example #11
from sklearn import model_selection

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.3)

# In[55]:

from sklearn import neighbors, metrics
from matplotlib import pyplot as plt

knn= neighbors.KNeighborsRegressor(n_neighbors = 12)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
    


# In[57]:


sizes = {} # key: coordinates; value: number of points at those coordinates
for (yt, yp) in zip(list(y_test), list(y_pred)):
    sizes[(yt, yp)] = sizes.get((yt, yp), 0) + 1
Example #12
import pandas as pd
import numpy as np
import sklearn.model_selection as model_selection
import sklearn.neighbors as neighbors
from sklearn.preprocessing import scale
from sklearn.datasets import load_boston  # removed in scikit-learn 1.2

boston = load_boston()

features = scale(boston.data)
target = boston.target
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
res = []
for par in np.linspace(1.0, 10.0, num=200):
    print('p = %f' % par)
    estimator = neighbors.KNeighborsRegressor(n_neighbors=5, weights='distance', p=par, metric='minkowski')
    score = model_selection.cross_val_score(estimator, features, target, cv=kf,
                                            scoring='neg_mean_squared_error').mean()
    res.append(score)
    print('score = %f' % score)
print(sorted(res))
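With scoring='neg_mean_squared_error', higher is better, so the best Minkowski exponent is the arg-max of res. A small follow-up sketch (not in the original snippet):

p_grid = np.linspace(1.0, 10.0, num=200)
best_p = p_grid[int(np.argmax(res))]
print('best p = %f (mean score = %f)' % (best_p, max(res)))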
Example #13
def train_models(data, attrs, Target) -> None:
    warnings.filterwarnings("ignore",
                            category=FutureWarning,
                            module="sklearn",
                            lineno=166)
    #Machine Learning Algorithm (MLA) Selection and Initialization
    MLA = [
        #ensemble.AdaBoostRegressor(),
        #ensemble.BaggingRegressor(),
        #ensemble.ExtraTreesRegressor(n_estimators=10),
        #ensemble.GradientBoostingRegressor(),
        #XGBRegressor(),
        #gaussian_process.GaussianProcessRegressor(),
        ensemble.RandomForestRegressor(n_estimators=30, max_depth=5),
        linear_model.Ridge(alpha=0.0001),
        linear_model.Lasso(alpha=0.0001, selection='random'),
        neighbors.KNeighborsRegressor(),
        svm.SVR(kernel='rbf', gamma='auto'),
        tree.DecisionTreeRegressor(max_depth=5),
    ]

    #split dataset in cross-validation with this splitter class: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html#sklearn.model_selection.ShuffleSplit
    #note: this is an alternative to train_test_split
    cv_split = model_selection.ShuffleSplit(
        n_splits=10, test_size=.2, train_size=.8, random_state=0
    )  # run the models 10x on an 80/20 train/test split

    #create table to compare MLA metrics
    MLA_columns = [
        'MLA Name', 'MLA Train Accuracy Mean', 'MLA Test Accuracy Mean',
        'MLA Test Accuracy 3*STD', 'MLA Time'
    ]
    MLA_compare = pd.DataFrame(columns=MLA_columns)

    #create table to compare MLA predictions
    MLA_predict = data[Target]
    data_target = utils.column_or_1d(MLA_predict.values.ravel(), warn=True)
    data_features = data[attrs]
    pd.options.mode.chained_assignment = None
    #index through MLA and save performance to table

    row_index = 0
    for alg in MLA:
        #set name and parameters
        MLA_name = alg.__class__.__name__
        MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
        #     MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
        #     print(MLA_name)
        #score model with cross validation: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate
        cv_results = model_selection.cross_validate(
            alg,
            data_features,
            data_target,
            cv=cv_split,
            scoring='neg_mean_absolute_error',
            return_train_score=True)
        MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
        MLA_compare.loc[
            row_index,
            'MLA Train Accuracy Mean'] = cv_results['train_score'].mean()
        MLA_compare.loc[
            row_index,
            'MLA Test Accuracy Mean'] = cv_results['test_score'].mean()
        #if this is a non-bias random sample, then +/-3 standard deviations (std) from the mean, should statistically capture 99.7% of the subsets
        MLA_compare.loc[row_index, 'MLA Test Accuracy 3*STD'] = cv_results[
            'test_score'].std() * 3  #let's know the worst that can happen!

        #save MLA predictions - see section 6 for usage
        alg.fit(data_features, data_target)
        MLA_predict[MLA_name] = alg.predict(data_features)

        row_index += 1

    MLA_compare.sort_values(by=['MLA Test Accuracy Mean'],
                            ascending=False,
                            inplace=True)
    print(MLA_compare)
Example #14
    n_splits = 5
    kf = KFold(n_splits=n_splits)
    for i, (train_ind, val_ind) in enumerate(kf.split(train)):
        tra = train.drop(['血糖'], axis=1).values[train_ind]
        tra_label = train['血糖'].values[train_ind]
        val = train.drop(['血糖'], axis=1).values[val_ind]
        val_label = train['血糖'].values[val_ind]

        pred = pd.DataFrame()
        # lasso
        lasso = linear_model.Lasso(alpha=0.005455)
        lasso.fit(tra, tra_label)
        pred['lasso'] = lasso.predict(val)
        # knn
        knn = neighbors.KNeighborsRegressor(n_neighbors=25,
                                            weights='uniform',
                                            n_jobs=-1)
        knn.fit(tra, tra_label)
        pred['knn'] = knn.predict(val)
        # svr
        svr = svm.SVR(kernel='rbf', C=10, gamma=0.01)
        svr.fit(tra, tra_label)
        pred['svr'] = svr.predict(val)
        # xgboost
        dtrain = xgb.DMatrix(tra, label=tra_label)
        dval = xgb.DMatrix(val)
        base_score = train['血糖'].mean()  # '血糖' = blood glucose; its mean seeds xgboost's base_score
        param_gbtree = {
            'booster': 'gbtree',
            'eta': 0.01,
            'gamma': 0,
Example #15
#print header
#X, y = preprocessDataWithoutScale(data)

joblib.dump(X_scaler, 'pickle-final/X_scaler.pkl')
joblib.dump(y_scaler, 'pickle-final/y_scaler.pkl')
joblib.dump(imp, 'pickle-final/Imputer.pkl')
joblib.dump(vec, 'pickle-final/Vector.pkl')

estimators = []

# K-Nearest Neighbors
estimators.append({
    "name":
    "KNN",
    "model":
    neighbors.KNeighborsRegressor(weights="uniform", n_neighbors=5)
})

# Gradient Boosting Regressor
estimators.append({
    "name":
    "GBR",
    "model":
    ensemble.GradientBoostingRegressor(max_features=0.1,
                                       n_estimators=2100,
                                       max_depth=6,
                                       min_samples_leaf=1,
                                       learning_rate=0.02)
})

# Random Forest
Example #16
def dict_method_reg():
    dict_method = {}
    # 1st part
    """1SVR"""
    me1 = SVR(kernel='rbf', gamma='auto', degree=3, tol=1e-3, epsilon=0.1, shrinking=False, max_iter=2000)
    cv1 = 5
    scoring1 = 'r2'
    param_grid1 = [{'C': [1, 0.75, 0.5, 0.25, 0.1], 'epsilon': [0.01, 0.001, 0.0001]}]
    dict_method.update({"SVR-set": [me1, cv1, scoring1, param_grid1]})

    """2BayesianRidge"""
    me2 = BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False,
                        copy_X=True, fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06,
                        n_iter=300, normalize=False, tol=0.01, verbose=False)
    cv2 = 5
    scoring2 = 'r2'
    param_grid2 = [{'alpha_1': [1e-07, 1e-06, 1e-05], 'alpha_2': [1e-07, 1e-05, 1e-03]}]
    dict_method.update({'BayR-set': [me2, cv2, scoring2, param_grid2]})

    """3SGDRL2"""
    me3 = SGDRegressor(alpha=0.0001, average=False,
                       epsilon=0.1, eta0=0.01, fit_intercept=True, l1_ratio=0.15,
                       learning_rate='invscaling', loss='squared_loss', max_iter=1000,
                       penalty='l2', power_t=0.25,
                       random_state=0, shuffle=True, tol=0.01,
                       verbose=0, warm_start=False)
    cv3 = 5
    scoring3 = 'r2'
    param_grid3 = [{'alpha': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 1e-05]}]
    dict_method.update({'SGDRL2-set': [me3, cv3, scoring3, param_grid3]})

    """4KNR"""
    me4 = neighbors.KNeighborsRegressor(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2,
                                        metric='minkowski')
    cv4 = 5
    scoring4 = 'r2'
    param_grid4 = [{'n_neighbors': [3, 4, 5, 6]}]
    dict_method.update({"KNR-set": [me4, cv4, scoring4, param_grid4]})

    """5kernelridge"""
    kernel = 1.0 * RBF(1.0)
    me5 = kernel_ridge.KernelRidge(alpha=1, kernel=kernel, gamma="scale", degree=3, coef0=1, kernel_params=None)
    cv5 = 5
    scoring5 = 'r2'
    param_grid5 = [{'alpha': [100, 10, 1, 0.1, 0.01, 0.001]}]
    dict_method.update({'KRR-set': [me5, cv5, scoring5, param_grid5]})

    """6GPR"""
    # kernel = 1.0 * RBF(1.0)
    kernel = Matern(length_scale=0.1, nu=0.5)
    me6 = gaussian_process.GaussianProcessRegressor(kernel=kernel, alpha=1e-10, optimizer='fmin_l_bfgs_b',
                                                    n_restarts_optimizer=10,
                                                    normalize_y=False, copy_X_train=True, random_state=0)
    cv6 = 5
    scoring6 = 'r2'
    param_grid6 = [{'alpha': [1e-11, 1e-10, 1e-9, 1e-8, 1e-7]}]
    dict_method.update({"GPR-set": [me6, cv6, scoring6, param_grid6]})

    # 2nd part

    """6RFR"""
    me7 = ensemble.RandomForestRegressor(n_estimators=100, max_depth=None, min_samples_split=2, min_samples_leaf=1,
                                         min_weight_fraction_leaf=0.0, max_leaf_nodes=None, min_impurity_decrease=0.0,
                                         min_impurity_split=None, bootstrap=True, oob_score=False,
                                         random_state=None, verbose=0, warm_start=False)
    cv7 = 5
    scoring7 = 'r2'
    param_grid7 = [{'max_depth': [3, 4, 5, 6]}]
    dict_method.update({"RFR-em": [me7, cv7, scoring7, param_grid7]})

    """7GBR"""
    me8 = ensemble.GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100,
                                             subsample=1.0, criterion='friedman_mse', min_samples_split=2,
                                             min_samples_leaf=1, min_weight_fraction_leaf=0.,
                                             max_depth=3, min_impurity_decrease=0.,
                                             min_impurity_split=None, init=None, random_state=None,
                                             max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None,
                                             warm_start=False, presort='auto')
    cv8 = 5
    scoring8 = 'r2'
    param_grid8 = [{'max_depth': [3, 4, 5, 6]}]
    dict_method.update({'GBR-em': [me8, cv8, scoring8, param_grid8]})

    "AdaBR"
    dt = DecisionTreeRegressor(criterion="mae", splitter="best", max_features=None, max_depth=3, min_samples_split=2)
    me9 = AdaBoostRegressor(dt, n_estimators=100, learning_rate=1, loss='square', random_state=0)
    cv9 = 5
    scoring9 = 'r2'
    param_grid9 = [{'n_estimators': [50, 120, 100, 200]}]
    dict_method.update({"AdaBR-em": [me9, cv9, scoring9, param_grid9]})

    '''TreeR'''
    me10 = DecisionTreeRegressor(
        criterion='mse', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1,
        min_weight_fraction_leaf=0.0, max_features=None, random_state=0, max_leaf_nodes=None,
        min_impurity_decrease=0.0, min_impurity_split=None, presort=False)
    cv10 = 5
    scoring10 = 'r2'
    param_grid10 = [{'max_depth': [3, 4, 5, 6], 'min_samples_split': [2, 3, 4]}]
    dict_method.update({'TreeC-em': [me10, cv10, scoring10, param_grid10]})

    'ElasticNet'
    me11 = ElasticNet(alpha=1.0, l1_ratio=0.7, fit_intercept=True, normalize=False, precompute=False, max_iter=1000,
                      copy_X=True, tol=0.0001, warm_start=False, positive=False, random_state=None)

    cv11 = 5
    scoring11 = 'r2'
    param_grid11 = [{'alpha': [0.0001, 0.001, 0.01, 0.1, 1], 'l1_ratio': [0.3, 0.5, 0.8]}]
    dict_method.update({"ElasticNet-L1": [me11, cv11, scoring11, param_grid11]})

    'Lasso'
    me12 = Lasso(alpha=1.0, fit_intercept=True, normalize=False, precompute=False, copy_X=True, max_iter=1000,
                 tol=0.001,
                 warm_start=False, positive=False, random_state=None, )

    cv12 = 5
    scoring12 = 'r2'
    param_grid12 = [{'alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 10, 100, 1000]}, ]
    dict_method.update({"Lasso-L1": [me12, cv12, scoring12, param_grid12]})

    """SGDRL1"""
    me13 = SGDRegressor(alpha=0.0001, average=False,
                        epsilon=0.1, eta0=0.01, fit_intercept=True, l1_ratio=0.15,
                        learning_rate='invscaling', loss='squared_loss', max_iter=1000,
                        penalty='l1', power_t=0.25,
                        random_state=0, shuffle=True, tol=0.01,
                        verbose=0, warm_start=False)
    cv13 = 5
    scoring13 = 'r2'
    param_grid13 = [{'alpha': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 1e-5, 1e-6, 1e-7], "epsilon": [0.1, 0.2, 1]}]
    dict_method.update({'SGDR-L1': [me13, cv13, scoring13, param_grid13]})

    """LinearSVR"""
    me14 = LinearSVR(epsilon=0.0, tol=1e-4, C=1.0,
                     loss='epsilon_insensitive', fit_intercept=True,
                     intercept_scaling=1., dual=True, verbose=0,
                     random_state=3, max_iter=1000)
    cv14 = 5
    scoring14 = 'r2'
    param_grid14 = [{'C': [10, 6, 5, 3, 2.5, 1, 0.75, 0.5, 0.25, 0.1], 'epsilon': [0.0, 0.1]}]
    dict_method.update({"LinearSVR-set": [me14, cv14, scoring14, param_grid14]})

    return dict_method
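Each value of the returned dictionary packs [estimator, cv, scoring, param_grid]. A hypothetical sketch of how one entry might drive a grid search:

from sklearn.model_selection import GridSearchCV

methods = dict_method_reg()
me, cv, scoring, param_grid = methods["KNR-set"]
gs = GridSearchCV(me, param_grid=param_grid, cv=cv, scoring=scoring)
# gs.fit(X, y) would then run the 5-fold search over n_neighbors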
Example #17
from sklearn import neighbors, linear_model
from graphic import census_2011, census_2015, land_area
import numpy as np

model_1 = linear_model.LinearRegression()

model_2 = neighbors.KNeighborsRegressor(n_neighbors=2)

# model_1 fits a single global line through all the data, while model_2
# averages the k=2 nearest neighbors, which can follow local structure more closely
model_1.fit(np.c_[census_2011], np.c_[land_area])

model_2.fit(np.c_[census_2011], np.c_[land_area])

# finally, we can predict the land area of any region from its properties
print(model_1.predict([[100_000]]))
print(model_2.predict([[100_000]]))
Example #18
def knn_cv_search(X_train, y_train, list_neighbors=None, cv_parameter=5,
                  scoring_parameter='accuracy', limit_list=(3, 11)):
    '''Search for the best neighbor count for a KNN regressor.
    The best number is the one with the best mean score over all
    cross-validation folds.
    '''
    #----------------------------------------------------------------------------
    # Build a list of odd neighbor counts
    #----------------------------------------------------------------------------
    if list_neighbors is None:
        myList = list(range(limit_list[0], limit_list[1]))
        list_neighbors = [x for x in myList if x % 2 != 0]

    #----------------------------------------------------------------------------
    # List holding the mean cross-validation (CV) scores
    #----------------------------------------------------------------------------
    list_cv_mean_scores = list()

    min_index = 0
    scores_mean = 0.0
    import time
    t0 = time.time()

    #----------------------------------------------------------------------------
    # Search for best neighbors count over folds
    #----------------------------------------------------------------------------
    for neighbor in list_neighbors:
        knn_clf = neighbors.KNeighborsRegressor(n_neighbors=neighbor)

        # knn_clf = KNeighborsClassifier(n_neighbors=neighbor)
        # -----------------------------------------------------------------------
        # Get all scores over all cross validations folds
        # -----------------------------------------------------------------------

        scores = cross_val_score(knn_clf, X_train, y_train,
                                 cv=cv_parameter, scoring=scoring_parameter)

        # -----------------------------------------------------------------------
        #Get mean of this scores for the given neighbor
        # -----------------------------------------------------------------------
        list_cv_mean_scores.append(scores.mean())

    print(
        "KNN: elapsed time for searching the best neighbor count = %0.3fs"
        % (time.time() - t0))

    #----------------------------------------------------------------------------
    # Minimal classification error
    #----------------------------------------------------------------------------
    if scoring_parameter == 'accuracy' or scoring_parameter == 'r2':
        #-------------------------------------------------------------------------
        # The best score is the value closest to 1 (highest accuracy), so
        # convert scores to errors before taking the minimum below.
        #-------------------------------------------------------------------------
        list_score = [1 - x for x in list_cv_mean_scores]
    else:
        #-------------------------------------------------------------------------
        # The best score is the lowest value (smallest loss).
        #-------------------------------------------------------------------------
        list_score = list_cv_mean_scores

    min_index = list_score.index(min(list_score))

    #----------------------------------------------------------------------------
    # Extract the best number of neighbors
    #----------------------------------------------------------------------------
    best_neighbors = list_neighbors[min_index]
    print("Optimal number for neighbors= %d" % best_neighbors)
    return best_neighbors, list_neighbors, list_score
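A hypothetical usage sketch (X_train/y_train are assumed to be in scope; 'r2' is one of the scorings the helper special-cases):

best_k, tried_ks, cv_errors = knn_cv_search(X_train, y_train,
                                            scoring_parameter='r2',
                                            limit_list=(3, 15))
final_knn = neighbors.KNeighborsRegressor(n_neighbors=best_k).fit(X_train, y_train)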
Example #19

## 4.5 Support vector machine
from sklearn.svm import SVC
model = SVC(C=1.0, kernel='rbf', gamma='auto')
"""Parameters
---
    C: penalty parameter of the error term
    gamma: kernel coefficient (float); if gamma is 'auto', 1/n_features is used instead.
"""

## 4.6 k-nearest neighbors (KNN)
from sklearn import neighbors
# define the kNN models
model = neighbors.KNeighborsClassifier(n_neighbors=5, n_jobs=1) # classification
model = neighbors.KNeighborsRegressor(n_neighbors=5, n_jobs=1) # regression
"""Parameters
---
    n_neighbors: number of neighbors to use
    n_jobs: number of parallel jobs
"""

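# A quick usage sketch for the regressor above (toy data, not from the source):
import numpy as np
X_demo = np.array([[0.], [1.], [2.], [3.]])
y_demo = np.array([0.0, 0.8, 0.9, 0.1])
knn_demo = neighbors.KNeighborsRegressor(n_neighbors=2).fit(X_demo, y_demo)
print(knn_demo.predict([[1.5]]))  # mean of the 2 nearest targets -> 0.85
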
## 4.7 Multilayer perceptron
from sklearn.neural_network import MLPClassifier
# define the multilayer perceptron classifier
model = MLPClassifier(activation='relu', solver='adam', alpha=0.0001)
"""Parameters
---
    hidden_layer_sizes: tuple
    activation: activation function
    solver: optimizer {'lbfgs', 'sgd', 'adam'}
Example #20
    x2_data = json.load(f)
with open('x3_data.json') as f:
    x3_data = json.load(f)

x1_norm = [(i - min(x1_data)) / (max(x1_data) - min(x1_data)) for i in x1_data]
x2_norm = [(i - min(x2_data)) / (max(x2_data) - min(x2_data)) for i in x2_data]
x3_norm = [(i - min(x3_data)) / (max(x3_data) - min(x3_data)) for i in x3_data]
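# Equivalent min-max scaling with scikit-learn (sketch, not in the original):
import numpy as np
from sklearn.preprocessing import MinMaxScaler
x_all_norm = MinMaxScaler().fit_transform(
    np.column_stack([x1_data, x2_data, x3_data]))  # same values as the three lists above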

# create training data
x_train = []
for i in range(len(x1_norm)):
    x_train.append([x1_norm[i], x2_norm[i], x3_norm[i]])

with open('y_data_k80.json') as f:
    y_train_K80 = json.load(f)
model_K80 = neighbors.KNeighborsRegressor(n_neighbors=3, weights='distance')
model_K80.fit(x_train, y_train_K80)

with open('y_data_p100.json') as f:
    y_train_P100 = json.load(f)
model_P100 = neighbors.KNeighborsRegressor(n_neighbors=3, weights='distance')
model_P100.fit(x_train, y_train_P100)

####################################################################


def send_signal(node, cmd):
    # Create a TCP/IP socket
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    port = 10000
    # Connect the socket to the port where the server is listening
Example #21
features = ['Feature 1','Feature 2','Feature 3','Feature 4','Feature 5 (meaningless but please still use it)',
            'Feature 6','Feature 7','Feature 8','Feature 9','Feature 10']
le = preprocessing.LabelEncoder()

df = pd.read_csv("/Users/markloughman/Desktop/Machine Learning/DATA/TheSumDataSetWithNoise",sep=";",nrows = 10000)

catnum = df["Noisy Target Class"].tolist()

X = df.loc[:,features]
y = df["Noisy Target"]

n_neighbors = 5


for weights in ['uniform', 'distance']:
    lr = neighbors.KNeighborsRegressor(n_neighbors, weights=weights)

    # cv=10 avoids having fewer members of a target class than folds
    NMSE_results = cross_val_score(lr, X, y, cv=10,
                                   scoring="neg_mean_squared_error")  # or another regression metric

    NMSE_results = NMSE_results * -1
    RMS_results = np.sqrt(NMSE_results)
    mean_error = RMS_results.mean()

    abs_mean_error = cross_val_score(lr, X, y, cv=10, scoring="neg_mean_absolute_error")
    abs_mean_error = (abs_mean_error * -1).mean()
Example #22
    print("Out-of-sample variance: %0.3f" % numpy.var(out_sample_errors))
    print("Out-of-sample mean: %0.3f" % numpy.mean(out_sample_errors))

    return (numpy.mean(out_sample_errors) + numpy.mean(in_sample_errors)) / 2

if __name__ == '__main__':
    dataset = utils.dict_to_numpy(
        utils.read_data_from_csv('data/winequality-red.csv'),
        columns_to_exclude = ['fixed acidity', 'chlorides', 'free sulfur dioxide'])

    data = dataset['data']
    target = dataset['target']
    attributes = dataset['attributes']

    X_train = data[:-100]
    X_test = data[-100:]
    Y_train = target[:-100]
    Y_test = target[-100:]

    print('Linear regression')
    regression_model = linear_model.LinearRegression()
    regression(regression_model, X_train, X_test, Y_train, Y_test, 'linear')
    print()

    for i in range(1, 9):
        print('kNN regression for %s neighbors' % i)
        regression_model = neighbors.KNeighborsRegressor(i)
        print('Avg error %0.4f' % regression(
            regression_model, X_train, X_test, Y_train, Y_test, 'knn_%s' % i))
        print()
Example #23
# plt.show()

model = MLPRegressor()
predict_y = model.fit(train_X, train_gpa_y).predict(test_X)
m = mean_squared_error(test_gpa_y, predict_y)
print("MLP:%f" % m)

plt.scatter(test_gpa_y, predict_y)
plt.show()

model = tree.DecisionTreeRegressor()
predict_y = model.fit(train_X, train_gpa_y).predict(test_X)
m = mean_squared_error(test_gpa_y, predict_y)
print("Decision tree: %f" % m)

model = neighbors.KNeighborsRegressor()
predict_y = model.fit(train_X, train_gpa_y).predict(test_X)
mse = mean_squared_error(test_gpa_y, predict_y)
print("KNN:%f" % mse)

model = ensemble.RandomForestRegressor(n_estimators=20, random_state=1)
predict_y = model.fit(train_X, train_gpa_y).predict(test_X)
mse = mean_squared_error(test_gpa_y, predict_y)
print("Random forest: %f" % mse)

model = ensemble.GradientBoostingRegressor(n_estimators=100, random_state=1)
predict_y = model.fit(train_X, train_gpa_y).predict(test_X)
mse = mean_squared_error(test_gpa_y, predict_y)
print("GBRT:%f" % mse)

model = ensemble.BaggingRegressor(random_state=1)
Example #24
#%% 7. KNN of reputation.
##knn model on predict reputation.
##variables: skill scores
xfifa=fifa.iloc[:,53:87]
yfifa=fifa['International_Reputation']
xsfifa = pd.DataFrame( scale(xfifa), columns=xfifa.columns ) 
ysfifa = yfifa.copy()
X_train, X_test, y_train, y_test = train_test_split(xsfifa, ysfifa, test_size = 0.25, random_state=2019)


#%%
rmse_val = [] 
for K in range(1, 16):
    model = neighbors.KNeighborsRegressor(n_neighbors = K)

    model.fit(X_train, y_train) 
    pred=model.predict(X_test) 
    error = sqrt(mean_squared_error(y_test,pred))
    rmse_val.append(error)
    print('RMSE value for k= ' , K , 'is:', error)


#%%
curve = pd.DataFrame(rmse_val)
curve.plot()
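
#%%
# Pick the K with the lowest RMSE from the sweep above (sketch, not in the source):
best_K = rmse_val.index(min(rmse_val)) + 1
print('best K =', best_K)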
Example #25
def dict_method_reg():
    dict_method = {}
    # 1st part
    """4KNR"""
    me4 = neighbors.KNeighborsRegressor(n_neighbors=5,
                                        weights='distance',
                                        algorithm='auto',
                                        leaf_size=30,
                                        p=2,
                                        metric='minkowski')
    cv4 = 5
    scoring4 = 'r2'
    param_grid4 = [{'n_neighbors': [4, 5, 6, 7, 8], "leaf_size": [10, 20, 30]}]
    dict_method.update({"KNR-set": [me4, cv4, scoring4, param_grid4]})
    """1SVR"""
    me1 = SVR(kernel='rbf',
              gamma='auto',
              degree=3,
              tol=1e-3,
              epsilon=0.1,
              shrinking=True,
              max_iter=2000)
    cv1 = 5
    scoring1 = 'r2'
    param_grid1 = [{'C': [10, 1, 0.1, 0.01, 0.001], 'kernel': ker}]
    dict_method.update({"SVR-set": [me1, cv1, scoring1, param_grid1]})
    """5kernelridge"""
    me5 = kernel_ridge.KernelRidge(alpha=1,
                                   gamma="scale",
                                   degree=3,
                                   coef0=-1,
                                   kernel_params=None)
    cv5 = 5
    scoring5 = 'r2'
    param_grid5 = [{'alpha': [10, 1, 0.1, 0.001], 'kernel': ker}]
    dict_method.update({'KRR-set': [me5, cv5, scoring5, param_grid5]})
    """6GPR"""
    me6 = gaussian_process.GaussianProcessRegressor(kernel=kernel,
                                                    alpha=1e-10,
                                                    optimizer='fmin_l_bfgs_b',
                                                    n_restarts_optimizer=0,
                                                    normalize_y=False,
                                                    copy_X_train=True,
                                                    random_state=0)
    cv6 = 5
    scoring6 = 'r2'
    param_grid6 = [{'kernel': ker}]
    dict_method.update({"GPR-set": [me6, cv6, scoring6, param_grid6]})

    # 2nd part
    """6RFR"""
    me7 = RandomForestRegressor(n_estimators=100,
                                max_depth=None,
                                min_samples_split=2,
                                min_samples_leaf=1,
                                min_weight_fraction_leaf=0.0,
                                max_leaf_nodes=None,
                                min_impurity_decrease=0.0,
                                min_impurity_split=None,
                                bootstrap=True,
                                oob_score=False,
                                random_state=None,
                                verbose=0,
                                warm_start=False)
    cv7 = 5
    scoring7 = 'r2'
    param_grid7 = [{
        'max_depth': [3, 4, 5],
    }]
    dict_method.update({"RFR-em": [me7, cv7, scoring7, param_grid7]})
    """7GBR"""
    me8 = GradientBoostingRegressor(
        loss='ls',
        learning_rate=0.1,
        n_estimators=100,
        subsample=1.0,
        criterion='friedman_mse',
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.,
        max_depth=3,
        min_impurity_decrease=0.,
        min_impurity_split=None,
        init=None,
        random_state=None,
        max_features=None,
        alpha=0.9,
        verbose=0,
        max_leaf_nodes=None,
        warm_start=False,
    )
    cv8 = 5
    scoring8 = 'r2'
    param_grid8 = [{'max_depth': [3, 4, 5, 6]}]
    dict_method.update({'GBR-em': [me8, cv8, scoring8, param_grid8]})

    "AdaBR"
    dt = DecisionTreeRegressor(criterion="mse",
                               splitter="best",
                               max_features=None,
                               max_depth=5,
                               min_samples_split=4)
    me9 = AdaBoostRegressor(dt,
                            n_estimators=200,
                            learning_rate=0.05,
                            loss='linear',
                            random_state=0)
    cv9 = 5
    scoring9 = 'explained_variance'
    param_grid9 = [{'n_estimators': [100, 200]}]
    dict_method.update({"AdaBR-em": [me9, cv9, scoring9, param_grid9]})
    '''DTR'''
    me10 = DecisionTreeRegressor(
        criterion="mse",
        splitter="best",
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.,
        max_features=None,
        random_state=0,
        max_leaf_nodes=None,
        min_impurity_decrease=0.,
        min_impurity_split=None,
    )
    cv10 = 5
    scoring10 = 'r2'
    param_grid10 = [{
        'max_depth': [2, 3, 4, 5, 6, 7],
        "min_samples_split": [2, 3, 4],
        "min_samples_leaf": [1, 2]
    }]
    dict_method.update({'DTR-em': [me10, cv10, scoring10, param_grid10]})

    'ElasticNet'
    me11 = ElasticNet(alpha=1.0,
                      l1_ratio=0.7,
                      fit_intercept=True,
                      normalize=False,
                      precompute=False,
                      max_iter=1000,
                      copy_X=True,
                      tol=0.0001,
                      warm_start=False,
                      positive=False,
                      random_state=None)

    cv11 = 5
    scoring11 = 'r2'
    param_grid11 = [{
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
        'l1_ratio': [0.3, 0.5, 0.8]
    }]
    dict_method.update({"EN-L1": [me11, cv11, scoring11, param_grid11]})

    'Lasso'
    me12 = Lasso(
        alpha=1.0,
        fit_intercept=True,
        normalize=False,
        precompute=False,
        copy_X=True,
        max_iter=1000,
        tol=0.001,
        warm_start=False,
        positive=False,
        random_state=None,
    )

    cv12 = 5
    scoring12 = 'r2'
    param_grid12 = [
        {
            'alpha': [
                0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 10, 100,
                1000
            ]
        },
    ]
    dict_method.update({"LASSO-L1": [me12, cv12, scoring12, param_grid12]})
    """2BayesianRidge"""
    me2 = BayesianRidge(alpha_1=1e-06,
                        alpha_2=1e-06,
                        compute_score=False,
                        copy_X=True,
                        fit_intercept=True,
                        lambda_1=1e-06,
                        lambda_2=1e-06,
                        n_iter=300,
                        normalize=False,
                        tol=0.01,
                        verbose=False)
    cv2 = 5
    scoring2 = 'r2'
    param_grid2 = [{
        'alpha_1': [1e-07, 1e-06, 1e-05],
        'alpha_2': [1e-07, 1e-06, 1e-05]
    }]
    dict_method.update({'BRR-L1': [me2, cv2, scoring2, param_grid2]})
    """3SGDRL2"""
    me3 = SGDRegressor(alpha=0.0001,
                       average=False,
                       epsilon=0.1,
                       eta0=0.01,
                       fit_intercept=True,
                       l1_ratio=0.15,
                       learning_rate='invscaling',
                       loss='squared_loss',
                       max_iter=1000,
                       penalty='l2',
                       power_t=0.25,
                       random_state=0,
                       shuffle=True,
                       tol=0.01,
                       verbose=0,
                       warm_start=False)
    cv3 = 5
    scoring3 = 'r2'
    param_grid3 = [{
        'alpha': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 1e-05],
        'loss': ['squared_loss', "huber"],
        "penalty": ["l1", "l2"]
    }]
    dict_method.update({'SGDR-L1': [me3, cv3, scoring3, param_grid3]})
    """PassiveAggressiveRegressor"""
    me14 = PassiveAggressiveRegressor(C=1.0,
                                      fit_intercept=True,
                                      max_iter=1000,
                                      tol=0.001,
                                      early_stopping=False,
                                      validation_fraction=0.1,
                                      n_iter_no_change=5,
                                      shuffle=True,
                                      verbose=0,
                                      loss='epsilon_insensitive',
                                      epsilon=0.1,
                                      random_state=None,
                                      warm_start=False,
                                      average=False)
    cv14 = 5
    scoring14 = 'r2'
    param_grid14 = [{
        'C': [1.0e8, 1.0e6, 10000, 100, 50, 10, 5, 2.5, 1, 0.5, 0.1, 0.01]
    }]
    dict_method.update({'PAR-L1': [me14, cv14, scoring14, param_grid14]})

    return dict_method
Example #26
import numpy as np
import pandas as pd  # needed for pd.read_csv below
from sklearn.ensemble import RandomForestRegressor as rf
from sklearn.preprocessing import MinMaxScaler
from sklearn import neighbors

if __name__ == '__main__':

    # Load data
    df = pd.read_csv('dataset oversampling.csv')

    X = df.drop(['Mn', 'MWD'], axis=1)
    Y1 = df['Mn']
    Y2 = df['MWD']

    # Train the dataset with optimized models
    knn = neighbors.KNeighborsRegressor(n_neighbors=1, weights='uniform')
    rfr = rf(max_depth=6,
             max_features='sqrt',
             min_samples_split=2,
             n_estimators=200)

    min_max_scaler = MinMaxScaler()
    X_nor = min_max_scaler.fit_transform(X)
    knn.fit(X_nor, Y1)
    rfr.fit(X, Y2)

    # Generate the combinatorial condition pool
    conditions = [[
        M, M_CTA_1 * M_CTA_2, PC_M_1 * PC_M_2, 1, 0, 0, 0, 0, 0, 0, time
    ] for M in np.arange(0.1, 8, step=0.1)
                  for M_CTA_1 in np.arange(1, 9, step=1)
Example #27
pc_job = []

K80_node = 'c2180'
V100_node = 'd1024'
host_node = 'c0168'
testcase = args.tc
### also, change .h5 file folder in jobs ###

INTERVAL = 30  # make decision every 30s

######################### do a regression fit ########################
with open('x_data.json') as f:
    x_train = json.load(f)
with open('y_data.json') as f:
    y_train = json.load(f)
model = neighbors.KNeighborsRegressor(n_neighbors=1, weights='distance')
model.fit(x_train, y_train)

####################################################################


def send_signal(node, cmd):
    # Create a TCP/IP socket
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    port = 10000 if node == K80_node else 10001
    # Connect the socket to the port where the server is listening
    server_address = (node, int(port))

    print('connecting to {} port {}'.format(*server_address))
    sock.connect(server_address)
Example #28
def plot():
    map_path = file_path + "/resources/sf_block_groups/sf_block_groups_nowater.geojson"
    coc_path = file_path + "/resources/sf_block_groups/coc"
    plot_path = file_path + "/resources/sf_data/sf_overspace_plot_data.json"
    fig_path = file_path + "/results/sf_change_overspace.pdf"

    # Read data.
    with open(plot_path, "r") as plot_file:
        data = json.loads(plot_file.read().strip("\n"))
    
    coc = gpd.read_file(coc_path)
    coc = coc[coc["GEOID"].astype("int") - coc["GEOID"].astype("int") % 1000000 == 6075000000]
    coc = coc[coc["GEOID"].astype("int") != 6075017902]
    coc = coc[coc["COCFLAG__1"] == 1]
    coc = coc.to_crs({"init": "epsg:4326"})

    map = gpd.read_file(map_path)
    map["geoid"] = map["stfid"].astype("int")
    map = map[["geoid", "geometry"]]
    map["bg_lng"] = map.centroid.apply(lambda p: p.x)
    map["bg_lat"] = map.centroid.apply(lambda p: p.y)
    map = map[map["geoid"] != 60750179021]

    # Get supply curve data
    sup = pd.DataFrame.from_dict(data["sup"])
    sup["geoid"] = data["index"]
    sup = sup[sup["geoid"] != 60750601001]
    sup = sup[sup["geoid"] != 60750604001]
    sup = sup[sup["geoid"] != 60750332011]
    sup = sup[sup["geoid"] != 60750610002]
    sup = sup[sup["geoid"] != 60750264022]
    sup = sup[sup["geoid"] != 60750258002]
    sup[sup["geoid"] == 60750610001] = 1
    sup = map.merge(sup, on = "geoid", how = "left")

    # Get price curve data
    pri = pd.DataFrame.from_dict(data["pri"])
    pri["geoid"] = data["index"]
    pri = map.merge(pri, on = "geoid", how = "left")

    # Plot parameter and setting.
    font = FontProperties()
    font.set_weight("bold")
    font.set_size(10)
    matplotlib.rcParams.update({"font.size": 6})
    alpha = 0.5
    alpha2 = 0.3
    k = 2
    bar_cons = 0.66
    bar_mv = 0.27
    for i in [0, 1, 2, 3, 4]:
        ax[i].set_xlim([-122.513, -122.355])
        ax[i].set_ylim([37.707, 37.833])
        ax[i].set_axis_off()
        ax[i].xaxis.set_major_locator(plt.NullLocator())
        ax[i].yaxis.set_major_locator(plt.NullLocator())
        coc.plot(ax = ax[i], linewidth = 0.5, alpha = 0)
    app_list = ["uber", "lyft", "taxi"]
    cmap = "RdYlGn"

    f = 0
    for i in [0, 1, 2]:
        sup["plot"] = sup[app_list[i]] #/ sup["area"] * 581
        knn = neighbors.KNeighborsRegressor(k, weights="distance") # Fill empty area.
        train_x = sup[["plot", "bg_lat", "bg_lng"]].dropna()[["bg_lat", "bg_lng"]].values
        train_y = sup["plot"].dropna().values
        predict_x = sup[["bg_lat", "bg_lng"]].values
        sup["plot"] = knn.fit(train_x, train_y).predict(predict_x)
        vmin = sup["plot"].min()
        vmax = sup["plot"].quantile(0.95)
        # plot
        sup.plot(ax = ax[i], linewidth = 0, column = "plot", cmap = cmap,
            alpha = alpha, k = 10, vmin = vmin, vmax = vmax)
        ax[i].set_title(upperfirst(app_list[i]) + " Supply", fontproperties = font)
        fig = ax[i].get_figure()
        cax = fig.add_axes([0.128 + 0.16 * i, 0.07, 0.12, 0.02])
        sm = plt.cm.ScalarMappable(cmap = cmap, norm = plt.Normalize(vmin = vmin, vmax = vmax))
        sm._A = []
        fig.colorbar(sm, cax = cax, alpha = alpha2, extend = "both", orientation = "horizontal")

    cmap = "RdYlGn_r"
    f = 2
    for i in [3, 4]:
        pri["plot"] = (pri[app_list[i - 3]] - 1) * 100
        knn = neighbors.KNeighborsRegressor(k, weights="distance") # Fill empty area.
        train_x = pri[["plot", "bg_lat", "bg_lng"]].dropna()[["bg_lat", "bg_lng"]].values
        train_y = pri["plot"].dropna().values
        predict_x = pri[["bg_lat", "bg_lng"]].values
        pri["plot"] = knn.fit(train_x, train_y).predict(predict_x)
        vmin = 0
        vmax = 12
        # plot
        pri.plot(ax = ax[i], linewidth = 0, column = "plot", cmap = cmap,
            alpha = alpha, k = 10, vmin = vmin, vmax = vmax)
        ax[i].set_title(upperfirst(app_list[i - 3]) + " Price", fontproperties = font)
        fig = ax[i].get_figure()
        cax = fig.add_axes([0.128 + 0.16 * i, 0.07, 0.12, 0.02])
        sm = plt.cm.ScalarMappable(cmap = cmap, norm = plt.Normalize(vmin = vmin, vmax = vmax))
        sm._A = []
        fig.colorbar(sm, cax = cax, alpha = alpha2, extend = "both", orientation = "horizontal")

    map_path = file_path + "/resources/nyc_block_groups/nyc_bg_with_data_acs15.geojson"
    plot_path = file_path + "/resources/nyc_data/nyc_overspace_plot_data.json"
    fig_path = file_path + "/results/nyc_change_overspace.pdf"
    
    # Read data.
    with open(plot_path, "r") as plot_file:
        data = json.loads(plot_file.read().strip("\n"))

    map = gpd.read_file(map_path)
    coc = map.sort_values("income")[:80]
    map = map[map["population"].astype("float") > 10.0]
    map["geoid"] = map["geo_id"].astype("int")
    map = map[["geoid", "geometry"]]
    map["bg_lng"] = map.centroid.apply(lambda p: p.x)
    map["bg_lat"] = map.centroid.apply(lambda p: p.y)

    # Get supply curve data
    sup = pd.DataFrame.from_dict(data["sup"])
    sup["geoid"] = data["index"]
    sup = map.merge(sup, on = "geoid", how = "left")
    
    # Get price curve data
    pri = pd.DataFrame.from_dict(data["pri"])
    pri["geoid"] = data["index"]
    pri = pri[pri["uber"] > 1.0]
    pri = pri[pri["lyft"] > 1.0]
    pri = map.merge(pri, on = "geoid", how = "left")
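
# The spatial fill used throughout this example is plain KNN interpolation over
# block-group centroids: fit the regressor on the rows that have values, then
# predict a value for every row. A minimal standalone sketch of that idea (the
# coordinates and values below are made up, not the original data):

import numpy as np
import pandas as pd
from sklearn import neighbors

# Illustrative frame: two block groups have data, two are missing.
df = pd.DataFrame({
    "bg_lat": [37.71, 37.75, 37.78, 37.80],
    "bg_lng": [-122.50, -122.45, -122.40, -122.38],
    "plot":   [1.0, np.nan, 3.0, np.nan],
})

# Fit on the observed rows, predict everywhere (distance-weighted).
knn = neighbors.KNeighborsRegressor(n_neighbors=2, weights="distance")
train = df.dropna(subset=["plot"])
knn.fit(train[["bg_lat", "bg_lng"]].values, train["plot"].values)
df["plot"] = knn.predict(df[["bg_lat", "bg_lng"]].values)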
Beispiel #29
0

'''Nearest-neighbor regression
The KNN algorithm can be used not only for classification but also for regression:
find a sample's k nearest neighbors and assign the average of some attribute(s) of
those neighbors to the sample to obtain the predicted value of that attribute.
'''
import numpy as np
import matplotlib.pyplot as plt
from sklearn import neighbors

np.random.seed(0)
X = np.sort(5 * np.random.rand(40, 1), axis=0)
T = np.linspace(0, 5, 500)[:, np.newaxis]
y = np.sin(X).ravel()

y[::5] += 1 * (0.5 - np.random.rand(8))  # add noise to every 5th target value
n_neighbors = 5

for i, weights in enumerate(['uniform', 'distance']):
    knn = neighbors.KNeighborsRegressor(n_neighbors, weights=weights)
    y_ = knn.fit(X, y).predict(T)

    plt.subplot(2, 1, i + 1)
    plt.scatter(X, y, color='darkorange', label='data')
    plt.plot(T, y_, color='navy', label='prediction')
    plt.axis('tight')
    plt.legend()
    plt.title("KNeighborsRegressor (k = %i, weights = '%s')" % (n_neighbors,
                                                                weights))
plt.tight_layout()
plt.show()

'''
Neighborhood Components Analysis (NCA, NeighborhoodComponentsAnalysis) aims to
improve the accuracy of nearest-neighbor classification relative to the standard
Euclidean distance. It directly maximizes a stochastic variant of the
leave-one-out k-nearest-neighbor (KNN) score on the training set, and it can
also learn a low-dimensional linear projection of the data.
NCA handles multi-class problems naturally without increasing the model size
and introduces no extra parameters that the user must tune.
'''
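
# The original example is truncated at this point; as a minimal sketch of how
# NCA is typically used (the dataset and parameters are illustrative
# assumptions, not from the source), chain NeighborhoodComponentsAnalysis with
# a KNN classifier in a pipeline:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
from sklearn.pipeline import Pipeline

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# NCA learns a linear transformation that maximizes the stochastic KNN score;
# KNN then classifies in the transformed space.
nca_knn = Pipeline([
    ("nca", NeighborhoodComponentsAnalysis(random_state=42)),
    ("knn", KNeighborsClassifier(n_neighbors=3)),
])
nca_knn.fit(X_train, y_train)
print(nca_knn.score(X_test, y_test))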
Beispiel #30
0
    def build_surrogate(self):
        """ Build a surrogate. Multiple options for models are available including:
            -Gaussian Processes
            -KNN
            -SVR
            
            Assumptions:
            None
            
            Source:
            N/A
            
            Inputs:
            state [state()]
            
            Outputs:
            self.sfc_surrogate    [fun()]
            self.thrust_surrogate [fun()]
            
            Properties Used:
            Defaulted values
        """     
        
        # unpack
        pycycle_problem = self.model
        
        
        pycycle_problem.set_solver_print(level=-1)
        pycycle_problem.set_solver_print(level=2, depth=0)        
        
        
        # Extract the data
        # Create lists that will turn into arrays
        Altitudes = []
        Machs     = []
        PCs       = []
        Thrust    = []
        TSFC      = []
        
        
        # if we added fc.dTS this would handle the deltaISA
        
        throttles = self.evaluation_throttles*1.  # *1. forces a copy of the array

        for MN, alt in self.evaluation_mach_alt: 
    
            print('***'*10)
            print(f'* MN: {MN}, alt: {alt}')
            print('***'*10)
            pycycle_problem['OD_full_pwr.fc.MN'] = MN
            pycycle_problem['OD_full_pwr.fc.alt'] = alt
            pycycle_problem['OD_part_pwr.fc.MN'] = MN
            pycycle_problem['OD_part_pwr.fc.alt'] = alt
    
            for PC in throttles: 
                print(f'## PC = {PC}')
                pycycle_problem['OD_part_pwr.PC']  = PC
                pycycle_problem.run_model()
                #Save to our list for SUAVE
                Altitudes.append(alt)
                Machs.append(MN)
                PCs.append(PC)
                TSFC.append(pycycle_problem['OD_part_pwr.perf.TSFC'][0])
                Thrust.append(pycycle_problem['OD_part_pwr.perf.Fn'][0])

            throttles = np.flip(throttles)  # reverse the sweep direction for the next flight condition

        # Now setup into vectors
        Altitudes = np.atleast_2d(np.array(Altitudes)).T * Units.feet
        Mach      = np.atleast_2d(np.array(Machs)).T
        Throttle  = np.atleast_2d(np.array(PCs)).T
        thr       = np.atleast_2d(np.array(Thrust)).T * Units.lbf
        sfc       = np.atleast_2d(np.array(TSFC)).T   * Units['lbm/hr/lbf'] # lbm/hr/lbf converted to (kg/N/s)
        
        
        # Once we have the data the model must be deleted because pycycle models can't be deepcopied
        self.pop('model')
        
        # Concatenate all together; things will start to look like the propulsor surrogate soon
        my_data = np.concatenate([Altitudes,Mach,Throttle,thr,sfc],axis=1)
        
        if self.save_deck:
            # Write an engine deck
            np.savetxt("pyCycle_deck.csv", my_data, delimiter=",")
        
        print(my_data)
        
        # Remove duplicate rows: view each row as a single void scalar so that
        # np.unique can deduplicate whole rows (equivalent to
        # np.unique(my_data, axis=0) on NumPy >= 1.13).
        b = np.ascontiguousarray(my_data).view(np.dtype((np.void, my_data.dtype.itemsize * my_data.shape[1])))
        _, idx = np.unique(b, return_index=True)

        my_data = my_data[idx]
   
        xy  = my_data[:,:3] # Altitude, Mach, Throttle
        thr = np.transpose(np.atleast_2d(my_data[:,3])) # Thrust
        sfc = np.transpose(np.atleast_2d(my_data[:,4]))  # SFC        
        
        self.altitude_input_scale = np.max(xy[:,0])
        self.thrust_input_scale   = np.max(thr)
        self.sfc_input_scale      = np.max(sfc)
        
        # normalize for better surrogate performance
        xy[:,0] /= self.altitude_input_scale
        thr     /= self.thrust_input_scale
        sfc     /= self.sfc_input_scale
       
       
        # Pick the type of process
        if self.surrogate_type  == 'gaussian':
            gp_kernel = Matern()
            regr_sfc = gaussian_process.GaussianProcessRegressor(kernel=gp_kernel)
            regr_thr = gaussian_process.GaussianProcessRegressor(kernel=gp_kernel)      
            thr_surrogate = regr_thr.fit(xy, thr)
            sfc_surrogate = regr_sfc.fit(xy, sfc)  
           
        elif self.surrogate_type  == 'knn':
            regr_sfc = neighbors.KNeighborsRegressor(n_neighbors=1,weights='distance')
            regr_thr = neighbors.KNeighborsRegressor(n_neighbors=1,weights='distance')
            sfc_surrogate = regr_sfc.fit(xy, sfc)
            thr_surrogate = regr_thr.fit(xy, thr)  
   
        elif self.surrogate_type  == 'svr':
            regr_thr = svm.SVR(C=500.)
            regr_sfc = svm.SVR(C=500.)
            sfc_surrogate  = regr_sfc.fit(xy, sfc)
            thr_surrogate  = regr_thr.fit(xy, thr)    
           
        elif self.surrogate_type == 'linear':
            regr_thr = linear_model.LinearRegression()
            regr_sfc = linear_model.LinearRegression()          
            sfc_surrogate  = regr_sfc.fit(xy, sfc)
            thr_surrogate  = regr_thr.fit(xy, thr)
            
        else:
            raise NotImplementedError('Selected surrogate method has not been implemented')
       
       
        if self.thrust_anchor is not None:
            cons = deepcopy(self.thrust_anchor_conditions)
            cons[0,0] /= self.altitude_input_scale
            base_thrust_at_anchor = thr_surrogate.predict(cons)
            self.thrust_anchor_scale = self.thrust_anchor/(base_thrust_at_anchor*self.thrust_input_scale)
            
        if self.sfc_anchor is not None:
            cons = deepcopy(self.sfc_anchor_conditions)
            cons[0,0] /= self.altitude_input_scale
            base_sfc_at_anchor = sfc_surrogate.predict(cons)
            self.sfc_anchor_scale = self.sfc_anchor/(base_sfc_at_anchor*self.sfc_input_scale)
       
        # Save the output
        self.sfc_surrogate    = sfc_surrogate
        self.thrust_surrogate = thr_surrogate
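
# The surrogates stored above expect a normalized altitude input and return
# normalized outputs. A hedged usage sketch (this helper is not part of the
# original source; it only mirrors the attribute names set in build_surrogate,
# and assumes the anchor scales are applied multiplicatively at evaluation time):

import numpy as np

def evaluate_surrogates(engine, altitude, mach, throttle):
    # 'engine' is assumed to be the object whose build_surrogate() ran above.
    xy = np.atleast_2d([altitude / engine.altitude_input_scale, mach, throttle])
    thrust = engine.thrust_surrogate.predict(xy) * engine.thrust_input_scale
    sfc = engine.sfc_surrogate.predict(xy) * engine.sfc_input_scale
    # Apply the optional anchor corrections if they were computed.
    if getattr(engine, "thrust_anchor_scale", None) is not None:
        thrust = thrust * engine.thrust_anchor_scale
    if getattr(engine, "sfc_anchor_scale", None) is not None:
        sfc = sfc * engine.sfc_anchor_scale
    return thrust, sfc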