Example #1
class CustomScaler(TransformerMixin):
    def __init__(self, *args, **kwargs):
        self.scaler = StandardScaler(*args, **kwargs)
        self.cont_col_names = [
            'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
            'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
            'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2',
            'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
            'GrLivArea', 'BsmtFullBath', 'FullBath', 'HalfBath',
            'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
            'FireplaceQu', 'GarageYrBlt', 'GarageCars', 'GarageArea',
            'GarageCond', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
            'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold'
        ]

    # takes X_enc, a pandas dataframe where discrete vars are one hot encoded
    def fit(self, X, y=None):
        self.scaler.fit(X[self.cont_col_names], y)
        return self

    # takes X_enc, a pandas dataframe where discrete vars are one hot encoded
    def transform(self, X, y=None, copy=None):
        continuous_cols = self.scaler.transform(X[self.cont_col_names])
        discrete_cols = X.drop(columns=self.cont_col_names).values

        return np.concatenate([continuous_cols, discrete_cols], axis=1)

    def get_params(self, *args, **kwargs):
        return self.scaler.get_params(*args, **kwargs)
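For comparison, here is a minimal sketch of the same idea using scikit-learn's ColumnTransformer (the cont_col_names list and the one-hot-encoded input DataFrame are assumed to match the example above): the continuous columns are standardized and the already-encoded discrete columns are passed through unchanged.

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

def make_partial_scaler(cont_col_names):
    # Scale only the listed continuous columns; remainder='passthrough'
    # keeps the one-hot-encoded discrete columns untouched.
    return ColumnTransformer(
        transformers=[('scale', StandardScaler(), cont_col_names)],
        remainder='passthrough')

As with CustomScaler, the transformed output places the scaled continuous columns first, followed by the untouched discrete ones.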
Example #2
def standard_scaler(df: pd.DataFrame,
                    columns_to_scale: List[str]) -> LearnerReturnType:
    """
    Fits a standard scaler to the dataset.

    Parameters
    ----------

    df : pandas.DataFrame
        A Pandas' DataFrame with columns to scale.
        It must contain all columns listed in `columns_to_scale`.

    columns_to_scale : list of str
        A list of names of the columns for standard scaling.
    """

    scaler = StandardScaler()

    scaler.fit(df[columns_to_scale].values)

    def p(new_data_set: pd.DataFrame) -> pd.DataFrame:
        new_data = scaler.transform(new_data_set[columns_to_scale].values)
        new_cols = pd.DataFrame(data=new_data, columns=columns_to_scale).to_dict('list')
        return new_data_set.assign(**new_cols)

    p.__doc__ = learner_pred_fn_docstring("standard_scaler")

    log = {'standard_scaler': {
        'standard_scaler': scaler.get_params(),
        'transformed_column': columns_to_scale}}

    return p, p(df), log
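A quick usage sketch for the learner above (the toy DataFrame and column names are invented for illustration, and the fklearn helpers it relies on are assumed to be importable): the returned function p reapplies the scaling fitted on the training frame to any new DataFrame with the same columns.

import pandas as pd

train = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 20.0, 30.0]})
p, scaled_train, log = standard_scaler(train, ['a', 'b'])

# p reuses the means and variances fitted on `train`
scaled_new = p(pd.DataFrame({'a': [4.0], 'b': [40.0]}))
print(log['standard_scaler']['standard_scaler'])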
Example #3
def trainAndEvaluate(dataset, dataset_test, n_components=40, dimReduction='PCA', classifier="SVC", preproc_speaker=False):

    ## Pre processing
    if(preproc_speaker):
        print("preprocessing has been computed previously")

    else:
        standard_scaler = StandardScaler()
        # Fit the scaler on the training data only, then apply it to both sets
        dataset['data'] = standard_scaler.fit_transform(dataset['data'])
        dataset_test['data'] = standard_scaler.transform(dataset_test['data'])

#    ## Dimensionality reduction
#    
#    
#    if (dimReduction == 'LDA'):
#        dimRed = LinearDiscriminantAnalysis(n_components=n_components)
#        dataset['data'] = dimRed.fit(dataset['data'], dataset['target']).transform(dataset['data'])
#        dataset_test['data'] = dimRed.transform(dataset_test['data'])
#    else:
#        if (dimReduction == 'PCA'):
#            dimRed = PCA(n_components=n_components)
#        elif (dimReduction == 'FA'):
#            dimRed = FeatureAgglomeration(n_clusters=n_components)
#        dataset['data'] = dimRed.fit_transform(dataset['data'])
#        dataset_test['data'] = dimRed.transform(dataset_test['data'])

    ## Classifier initialisation
    if (classifier == 'SVC'):
        clf = SVC(C=1, class_weight='balanced', verbose=1, probability=True)
    elif (classifier == 'kNN'):
        clf = neighbors.KNeighborsClassifier(n_neighbors=10)
    elif (classifier == 'tree'):
        clf = DecisionTreeClassifier(class_weight ='balanced', random_state=1)

    
    print("Training...")
    clf.fit(dataset['data'], dataset['target'])
    print("Predicting...")
    predicted = clf.predict(dataset_test['data'])
    
    report = classification_report(dataset_test['target'], predicted, target_names=dataset_test['target_names'])
    
    accuracy = np.mean(predicted == dataset_test['target'])
    cnf_matrix = confusion_matrix(dataset_test['target'], predicted)

    return accuracy, cnf_matrix, report
Example #4
def stand():
    # Standardization demo on a small 2-D dataset
    data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
    standard = StandardScaler()
    temp = standard.fit_transform(data)
    print("Sample means:", standard.mean_)
    print("Sample variances:", standard.var_)
    print("Scaler parameters:", standard.get_params())
    print(temp)
    return None
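As an illustrative aside (not part of the original snippet), the printed statistics can be reproduced with plain NumPy, since StandardScaler uses the population variance (ddof=0):

import numpy as np

data = np.array([[-1, 2], [-0.5, 6], [0, 10], [1, 18]])
print(data.mean(axis=0))         # roughly [-0.125  9.   ]
print(data.var(axis=0, ddof=0))  # roughly [ 0.546875 35.      ]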
Example #5
def standard():
    """
    Method to load a zero mean and unit variance StandardScaler

    RETURN:
    scaler
    """
    scaler = StandardScaler(copy=True)
    utils.display_get_params('StandardScaler Description', scaler.get_params())
    return (scaler)
Example #6
class StandardScaler(FeatureTransformAlgorithm):
    r"""Implementation of feature standard scaling algorithm.
    
    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT
    
    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

    See Also:
        * :class:`niaaml.preprocessing.feature_transform.FeatureTransformAlgorithm`
    """
    Name = 'Standard Scaler'

    def __init__(self, **kwargs):
        r"""Initialize StandardScaler.
        """
        super(StandardScaler, self).__init__()
        self.__std_scaler = StdScaler()

    def fit(self, x, **kwargs):
        r"""Fit implemented transformation algorithm.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to fit transformation algorithm.
        """
        self.__std_scaler.fit(x)

    def transform(self, x, **kwargs):
        r"""Transforms the given x data.

        Arguments:
            x (pandas.core.frame.DataFrame): Data to transform.

        Returns:
            pandas.core.frame.DataFrame: Transformed data.
        """

        return self.__std_scaler.transform(x)

    def to_string(self):
        r"""User friendly representation of the object.

        Returns:
            str: User friendly representation of the object.
        """
        return FeatureTransformAlgorithm.to_string(self).format(
            name=self.Name,
            args=self._parameters_to_string(self.__std_scaler.get_params()))
Example #7
class ScalingImplementation(EncodedInvariantImplementation):
    """ Class for applying the Scaling operation to data, using only features
    that were not one-hot encoded (i.e., not converted from categorical
    variables)

    :param params: optional, dictionary with the arguments
    """
    def __init__(self, **params: Optional[dict]):
        super().__init__()
        if not params:
            # Default parameters
            self.operation = StandardScaler()
        else:
            self.operation = StandardScaler(**params)
        self.params = params

    def get_params(self):
        return self.operation.get_params()
Example #8
def standard_scaler(df: pd.DataFrame,
                    columns_to_scale: List[str]) -> LearnerReturnType:
    """
    Fits a standard scaler to the dataset.

    The default behaviour is to replace the original values. To store
    the transformed values in a new column, specify `prefix` or `suffix`
    in the parameters, or specify a dictionary with the desired column
    mapping using the `columns_mapping` parameter.

    Parameters
    ----------

    df : pandas.DataFrame
        A Pandas' DataFrame with columns to scale.
        It must contain all columns listed in `columns_to_scale`.

    columns_to_scale : list of str
        A list of names of the columns for standard scaling.
    """

    scaler = StandardScaler()

    scaler.fit(df[columns_to_scale].values)

    def p(new_data_set: pd.DataFrame) -> pd.DataFrame:
        new_data = scaler.transform(new_data_set[columns_to_scale].values)
        new_cols = pd.DataFrame(data=new_data,
                                columns=columns_to_scale).to_dict('list')
        return new_data_set.assign(**new_cols)

    p.__doc__ = learner_pred_fn_docstring("standard_scaler")

    log = {
        'standard_scaler': {
            'standard_scaler': scaler.get_params(),
            'transformed_column': columns_to_scale
        }
    }

    return p, p(df), log
Example #9
def scale_data(trainX, testX):
    """
    Scale 2-D data.

    :param trainX: (array)
    :param testX: (array)
    :return:
        trainX: (array)
        testX: (array)
    """
    # remove overlap
    cut = int(trainX.shape[1] / 2)
    longX = trainX[:, -cut:, :]
    # flatten windows
    longX = longX.reshape((longX.shape[0] * longX.shape[1], longX.shape[2]))
    # flatten train and test
    flatTrainX = trainX.reshape(
        (trainX.shape[0] * trainX.shape[1], trainX.shape[2]))
    flatTestX = testX.reshape(
        (testX.shape[0] * testX.shape[1], testX.shape[2]))
    # standardize
    s = StandardScaler()
    # fit on training data
    s.fit(longX)
    # print("MEAN:")
    # print(s.mean_)
    # print("------------------------------------------")
    # print("VAR:")
    # print(s.var_)
    # print("------------------------------------------")
    # print("STD:")
    # print(s.scale_)

    print(s.get_params(True))
    # apply to training and test data
    longX = s.transform(longX)
    flatTrainX = s.transform(flatTrainX)
    flatTestX = s.transform(flatTestX)
    # reshape
    flatTrainX = flatTrainX.reshape((trainX.shape))
    flatTestX = flatTestX.reshape((testX.shape))
    return flatTrainX, flatTestX
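A hypothetical call of the function above, assuming windowed sensor data shaped (samples, timesteps, features); the shapes and random values below are invented purely for illustration:

import numpy as np

trainX = np.random.randn(40, 128, 9)
testX = np.random.randn(10, 128, 9)
trainX_scaled, testX_scaled = scale_data(trainX, testX)
print(trainX_scaled.shape, testX_scaled.shape)  # (40, 128, 9) (10, 128, 9)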
Example #10
    def test_params(self):

        estimator = StandardScaler(with_mean=False)
        params = estimator.get_params()
        params.update(
            {'estimator': estimator, 'reshapes': None, 'sample_dim': None})

        # check params set in constructor
        wrapper = wrap(estimator)
        self.assertEqual(wrapper.get_params(), params)
        self.assertEqual(wrapper.with_mean, False)

        # check params set by attribute
        wrapper.with_std = False
        params.update({'with_std': False})
        self.assertEqual(wrapper.get_params(), params)

        # check params set with set_params
        wrapper.set_params(copy=False)
        params.update({'copy': False})
        self.assertEqual(wrapper.get_params(), params)
Example #11
    def normalize_xs(self):
        """
        Standardization of 2-D data.
        """
        cut = int(self.x_train.shape[1] / 2)
        longX = self.x_train[:, -cut:, :]
        # flatten windows
        longX = longX.reshape(
            (longX.shape[0] * longX.shape[1], longX.shape[2]))
        # flatten train and test
        flatTrainX = self.x_train.reshape(
            (self.x_train.shape[0] * self.x_train.shape[1],
             self.x_train.shape[2]))
        flatTestX = self.x_test.reshape(
            (self.x_test.shape[0] * self.x_test.shape[1],
             self.x_test.shape[2]))
        # standardize
        s = StandardScaler()
        # fit on training data
        s.fit(longX)
        print("MEAN:")
        print(s.mean_)
        print("------------------------------------------")
        print("VAR:")
        print(s.var_)
        print("------------------------------------------")
        print("STD:")
        print(s.scale_)

        print(s.get_params(True))
        # apply to training and test data
        longX = s.transform(longX)
        flatTrainX = s.transform(flatTrainX)
        flatTestX = s.transform(flatTestX)
        # reshape
        self.x_train = flatTrainX.reshape((self.x_train.shape))
        self.x_test = flatTestX.reshape((self.x_test.shape))
Example #12
    def test_params(self):

        estimator = StandardScaler(with_mean=False)
        params = estimator.get_params()
        params.update({
            "estimator": estimator,
            "reshapes": None,
            "sample_dim": None
        })

        # check params set in constructor
        wrapper = wrap(estimator)
        self.assertEqual(wrapper.get_params(), params)
        self.assertEqual(wrapper.with_mean, False)

        # check params set by attribute
        wrapper.with_std = False
        params.update({"with_std": False})
        self.assertEqual(wrapper.get_params(), params)

        # check params set with set_params
        wrapper.set_params(copy=False)
        params.update({"copy": False})
        self.assertEqual(wrapper.get_params(), params)
Example #13
                        spatial_size=FeatureVectorConfig.SPATIALSIZE,
                        hist_feat=FeatureVectorConfig.HISTOGRAMFEATURES,
                        hist_bins=FeatureVectorConfig.HISTOGRAMBINS,
                        hog_feat=FeatureVectorConfig.HOGFEATURES)

t2 = time.time()
print(round(t2-t, 2), 'Seconds to extract HOG features...')
# Create an array stack of feature vectors
X = np.vstack((car_features, notcar_features)).astype(np.float64)                        
# Fit a per-column scaler
X_scaler = StandardScaler().fit(X)
# Apply the scaler to X
scaled_X = X_scaler.transform(X)

# save the scaler
print('X_scaler: ', X_scaler, ", get_params:", X_scaler.get_params(deep=True),
      ", mean:", X_scaler.mean_, ", std:", X_scaler.scale_)
print('saving scaler to: ', SCALERFILENAME)
#SaveAndRestoreClassifier.saveScalerFitX(X, SCALERFILENAME)
SaveAndRestoreClassifier.saveScaler(X_scaler, SCALERFILENAME)
# Define the labels vector
y = np.hstack((np.ones(len(car_features)), np.zeros(len(notcar_features))))


# Split up data into randomized training and test sets
rand_state = np.random.randint(0, 100)
X_train, X_test, y_train, y_test = train_test_split(
    scaled_X, y, test_size=0.2, random_state=rand_state)

print('Using:',FeatureVectorConfig.ORIENTATIONBINS,'orientations',FeatureVectorConfig.PIXELSPERCELL,
    'pixels per cell and', FeatureVectorConfig.CELLSPERBLOCK,'cells per block')
print('Feature vector length:', len(X_train[0]))
Example #14
def standardization_speaker(df_train, df_test):
    
    standard_scaler = StandardScaler()
    
    # SP1
    df_train1 = df_train[(df_train.target_names=="Sp1")]
    features = df_train1.loc[:, df_train1.columns != 'name']
    features = features.loc[:, features.columns != 'target_names']
    features = features.loc[:, features.columns != 'language']
    
    column_names = features.columns.values.tolist()
    # Fit on speaker 1's training features and scale them
    df_train1 = standard_scaler.fit_transform(features)
    
    df_test1 = df_test[(df_test.target_names=="Sp1")]
    features = df_test1.loc[:, df_test1.columns != 'name']
    features = features.loc[:, features.columns != 'target_names']
    features = features.loc[:, features.columns != 'language']
    
    df_test1 = standard_scaler.transform(features)
    
    
    # SP2
    df_train2 = df_train[(df_train.target_names=="Sp2")]
    features = df_train2.loc[:, df_train2.columns != 'name']
    features = features.loc[:, features.columns != 'target_names']
    features = features.loc[:, features.columns != 'language']
    
    df_train2 = standard_scaler.fit_transform(features)
    
    df_test2 = df_test[(df_test.target_names=="Sp2")]
    features = df_test2.loc[:, df_test2.columns != 'name']
    features = features.loc[:, features.columns != 'target_names']
    features = features.loc[:, features.columns != 'language']
    
    df_test2 = standard_scaler.transform(features)
    
    # SP3
    df_train3 = df_train[(df_train.target_names=="Sp3")]
    features = df_train3.loc[:, df_train3.columns != 'name']
    features = features.loc[:, features.columns != 'target_names']
    features = features.loc[:, features.columns != 'language']
    
    df_train3 = standard_scaler.fit_transform(features)
    
    df_test3 = df_test[(df_test.target_names=="Sp3")]
    features = df_test3.loc[:, df_test3.columns != 'name']
    features = features.loc[:, features.columns != 'target_names']
    features = features.loc[:, features.columns != 'language']
    
    df_test3 = standard_scaler.transform(features)
    
    #To dataframe type
    df_train1 = pd.DataFrame(data = df_train1, columns = column_names)
    df_train2 = pd.DataFrame(data = df_train2, columns = column_names)
    df_train3 = pd.DataFrame(data = df_train3, columns = column_names)
    
    df_test1 = pd.DataFrame(data = df_test1, columns = column_names)
    df_test2 = pd.DataFrame(data = df_test2, columns = column_names)
    df_test3 = pd.DataFrame(data = df_test3, columns = column_names)
     
    #concat
    df_train_fin = pd.concat([df_train1, df_train2, df_train3])
    df_test_fin = pd.concat([df_test1, df_test2, df_test3])
    
    #Reindex
    df_train_fin.index = range(df_train_fin.shape[0])
    df_test_fin.index = range(df_test_fin.shape[0])
    
    #concat with final columns
    df_train_fin = pd.concat([df_train_fin, df_train.loc[:, df_train.columns == 'language'], df_train.loc[:, df_train.columns == 'name'],df_train.loc[:, df_train.columns == 'target_names']], axis = 1)
    df_test_fin = pd.concat([df_test_fin, df_test.loc[:, df_test.columns == 'language'], df_test.loc[:, df_test.columns == 'name'],df_test.loc[:, df_test.columns == 'target_names']], axis = 1)
    
    #randomize
    df_train=df_train_fin.sample(frac=1,random_state=1)
    df_test=df_test_fin.sample(frac=1,random_state=1)
    
    
    return df_train, df_test
Example #15
scaler = StandardScaler()
means = np.mean(X_train)
std = np.std(X_train)
print(means[0])
# Populate the scaler's fitted attributes by hand; scale_ holds the per-feature standard deviation
scaler.mean_ = np.zeros(len(means))
scaler.scale_ = np.ones(len(means))
for i in range(len(means)):
    scaler.mean_[i] = means[i]
    scaler.scale_[i] = std[i]
print(scaler.mean_)
#scaler.mean_ = 
#X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


print(scaler.get_params(deep=True))
print(scaler.mean_)
print(scaler.scale_)
sys.exit()
# Let's retrain a new model on the first subset, called the **training set**:

# In[15]:
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.tree import DecisionTreeClassifier as DC



dt = DC(max_depth=3,min_samples_leaf=0.05*len(X_train))
abc = ABC(dt,algorithm='SAMME',
          n_estimators=8,
          learning_rate=0.5)
Example #16
def forecast_call_ml(zipcode):
    # url = "https://api.worldweatheronline.com/premium/v1/weather.ashx?"
    response = requests.get(
        f"{url}key={lw_key}&q={zipcode}&num_of_days=7&tp=24&mca=no&aqi=yes&format=json"
    ).json()
    response = response["data"]
    weather_dict = {
        "Dates": [],
        "Cloudcover": [],
        "Humidity": [],
        "PrecipInch": [],
        "Pressure": [],
        "FeelsLike": [],
        "HeatIndex": [],
        "MaxTemp": [],
        "MinTemp": [],
        "SunHours": [],
        "UVIndex": [],
    }

    weather_dict["Dates"] = ([
        response["weather"][i]["date"] for i in range(len(response["weather"]))
    ])
    weather_dict["Cloudcover"] = ([
        response["weather"][i]["hourly"][0]["cloudcover"]
        for i in range(len(response["weather"]))
    ])
    weather_dict["Humidity"] = ([
        response["weather"][i]["hourly"][0]["humidity"]
        for i in range(len(response["weather"]))
    ])
    weather_dict["PrecipInch"] = ([
        response["weather"][i]["hourly"][0]["precipInches"]
        for i in range(len(response["weather"]))
    ])
    weather_dict["Pressure"] = ([
        response["weather"][i]["hourly"][0]["pressure"]
        for i in range(len(response["weather"]))
    ])
    weather_dict["FeelsLike"] = ([
        response["weather"][i]["hourly"][0]["FeelsLikeF"]
        for i in range(len(response["weather"]))
    ])
    weather_dict["HeatIndex"] = ([
        response["weather"][i]["hourly"][0]["HeatIndexF"]
        for i in range(len(response["weather"]))
    ])
    weather_dict["MaxTemp"] = ([
        response["weather"][i]["maxtempF"]
        for i in range(len(response["weather"]))
    ])
    weather_dict["MinTemp"] = ([
        response["weather"][i]["mintempF"]
        for i in range(len(response["weather"]))
    ])
    weather_dict["SunHours"] = ([
        response["weather"][i]["sunHour"]
        for i in range(len(response["weather"]))
    ])
    weather_dict["UVIndex"] = ([
        response["weather"][i]["hourly"][0]["uvIndex"]
        for i in range(len(response["weather"]))
    ])
    weather_df = pd.DataFrame.from_dict(weather_dict,
                                        orient='index').transpose()
    weather_df = weather_df.apply(pd.to_numeric, errors='ignore')
    weather_df["TempDelta"] = weather_df.MaxTemp - weather_df.MinTemp
    weather_df["BarChange"] = weather_df["Pressure"].pct_change()
    weather_df["HeatChange"] = weather_df["HeatIndex"].pct_change()
    weather_df["HumChange"] = weather_df["Humidity"].pct_change()
    weather_df = weather_df.iloc[1:]
    new_migraine_df = weather_df.drop("Dates", axis=1)

    forecast_data = json.loads(weather_df.to_json(orient="records"))

    #Pull data from MongoDB
    collection = mongo.db.history
    history_df = pd.DataFrame(list(collection.find()))
    #Pre-Processing of Data
    hist_ml_df = history_df.drop(columns=["Dates", "_id", "index"])
    # Assign X (data) and y (target)
    X = hist_ml_df.drop("Migraine", axis=1)
    y = hist_ml_df["Migraine"]
    print(X.shape, y.shape)
    #Split our data into training and testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    #Fit on the training data, use StandardScaler
    X_scaler = StandardScaler().fit(X_train)
    X_scaler.get_params()
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    print(f"y_train value counts: {y_train.value_counts()}")
    print(f"y_test value counts: {y_test.value_counts()}")
    #Model creation
    model = SVC(kernel="linear")
    # Create the GridSearch estimator along with a parameter object containing the values to adjust
    param_grid = {'C': [1, 5, 10, 50], 'gamma': [0.0001, 0.0005, 0.001, 0.005]}
    grid = GridSearchCV(model, param_grid, verbose=3)
    # Fit the model using the grid search estimator.This will take the SVC model and try each combination of parameters
    grid.fit(X_train_scaled, y_train)
    # List the best parameters for this dataset
    print(grid.best_params_)
    # List the best score
    print(grid.best_score_)
    # print(grid.score({X_test_scaled}, {y_test}))
    #Use model to make predictions with the hypertuned model
    predictions = grid.predict(X_test_scaled)
    # Run model on forecast data to formulate predictions
    X_new_scaled = X_scaler.transform(new_migraine_df)
    forecast_predictions = grid.predict(X_new_scaled)
    print(f"forecast_predictions: {forecast_predictions}")
    lists = forecast_predictions.tolist()
    json_str = json.dumps(lists)
    print("Predictions inserted")
    print(f'Forecast_Data{forecast_data}')

    # render an index.html template and pass it the data you retrieved from the database
    # return (f"We did it! Machine learning achieved! {forecast_data} {json_str}")
    # return render_template("results_index.html")
    return render_template("results_index.html",
                           forecast_predictions=json_str,
                           forecast_data=forecast_data)
Example #17
# Reference: https://www.cnblogs.com/cola-1998/p/10218276.html
# Reference: https://blog.csdn.net/weixin_39175124/article/details/79463993
# Reference: https://blog.csdn.net/onthewaygogoing/article/details/79871559
from sklearn.preprocessing import StandardScaler
data = [[-1, 0], [1, 0], [1, 1], [1, 1]]
scaler = StandardScaler()
scaler.fit(data)
print(scaler.mean_)   # per-feature mean
print(scaler.scale_)  # per-feature standard deviation
# =====================================================================
import numpy as np
import warnings
warnings.filterwarnings("ignore")  # suppress a data-format warning
x_train = np.arange(10).reshape(5, 2)
x_test = np.arange(3, 7).reshape(2, 2)
y = [1, 0, 0, 0, 1]
# Instantiate StandardScaler with default parameters. If copy=False the scaled
# values replace the originals in place; with_mean must be set to False for
# sparse CSR/CSC matrices, otherwise memory usage explodes.
ss = StandardScaler(copy=True, with_mean=True, with_std=True)
print(x_train, x_test)
# Equivalent to ss.fit(x_train).transform(x_train): fit on x_train, then
# standardize it to zero mean and unit standard deviation.
z = ss.fit_transform(x_train)

# fit estimates the mean, variance, etc.; its second argument is y (label data), default None.
w = ss.fit(x_train)
print(ss.n_samples_seen_, ss.mean_, ss.var_, ss.scale_)
# Attributes: n_samples_seen_ = number of samples, mean_ = per-feature mean,
# var_ = per-feature variance, scale_ = per-feature standard deviation.
x_train = w.transform(x_train)
# Standardize the test set with the parameters fitted on the training set;
# this assumes the training sample is sufficiently representative.
x_test = w.transform(x_test)

print(z)
print(x_train, x_test)  # transformed training and test data
# If the original data is far from normally distributed, standardization may not help much.
print(ss.get_params(deep=True))  # returns the StandardScaler object's configured parameters
print(ss.inverse_transform(x_test, copy=True))  # the scaler stores its parameters, so the transform can be inverted
Example #18
def scaling(data):
    scaler = StandardScaler()
    data = scaler.fit_transform(data)
    dict_labels = dict()
    dict_labels["scaler"] = scaler.get_params(deep=True)
    return data, dict_labels
Example #19
 def test_set_params(self):
     scaler = StandardScaler()
     wrapper = SKLearnWrapper(module=scaler)
     self.assertEqual(scaler.get_params()["with_mean"], True)
     wrapper.set_params(with_mean=False, )
     self.assertEqual(scaler.get_params()["with_mean"], False)
Example #20
 def test_get_params(self):
     scaler = StandardScaler()
     wrapper = SKLearnWrapper(module=scaler)
     self.assertEqual(wrapper.get_params(), scaler.get_params())
Example #21
#!/usr/bin/env python
# encoding: utf-8

"""
@author: payneLi
@time: 18-7-11 4:05 PM
@email: [email protected]

"""
from sklearn.preprocessing import StandardScaler

data = [[1., -1., 3.],
        [2., 4., 2.],
        [4., 6., -1.]]


ss = StandardScaler()

target = ss.fit_transform(data)

mean = ss.mean_
var = ss.var_
params = ss.get_params()

print("target:", target, "\nmean:", mean, "\nvar:", var, "\nparams:", params)
Example #22
def parameter_runs(regressions, n_comp = 60):
    start_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    df = pandas.read_csv("training_set_ltv.csv")

    # df = df[df["bookings_pre_first_ride"] <2]
    
    scales = ["StandardScaler"] #, "MinMax", "Robust"]

    tf = data_prep(df)
    
    print(tf["non_cancelled_rides_after_first"].value_counts())
    class_names =  ["return" if x > 0 else "one-time" for x in tf["non_cancelled_rides_after_first"]]


    tf_train, tf_train2, tf_test = data_cleanup.train_validate_test_split(tf
        ,train_percent = 0.8
        ,validate_percent = 0.1
        ,seed =20)


    # X_train, X_test, y_train, y_test, X_columns= data_cleanup.create_test_and_train_data(
    #     tf, "non_cancelled_rides_after_first")

    X_train, y_train, X_columns = data_cleanup.split_xyc(tf_train, "non_cancelled_rides_after_first")
    X_train2, y_train2, X_columns = data_cleanup.split_xyc(tf_train2, "non_cancelled_rides_after_first")
    X_test, y_test, X_columns = data_cleanup.split_xyc(tf_test, "non_cancelled_rides_after_first")

    scale = "StandardScaler"

    if scale == "StandardScaler":
        scaler = StandardScaler()
    elif scale == "MinMax":
        scaler = MinMaxScaler()
    elif scale == "Robust":
        scaler = RobustScaler()

    scaler.fit(X_train)  # Don't cheat - fit only on training data
    print (scaler.get_params())
    pickle.dump(scaler, open("data_scaler.pkl", "wb"))
    scaler_filename = "scaler.save"
    joblib.dump(scaler, scaler_filename) 

    # And now to load...
    # print (set(y_train))
    scaler = joblib.load(scaler_filename) 

    X_train = scaler.transform(X_train)


    # pca = PCA(n_components=n_comp, svd_solver='full')
    
    # pca.fit(X_train)
    # X_train = pca.transform(X_train)

    # X_test = pca.transform(scaler.transform(X_test))
    # X_train2 = pca.transform(scaler.transform(X_train2))


    

    selector = SelectKBest(k= kbest).fit(X_train, y_train)

    X_train = selector.transform(X_train)
    X_train2 = selector.transform(X_train2)
    X_test = selector.transform(X_test)

#    X_columns = X_columns[selector.get_support()]

    # create temporary dataframes to add predictions to
    X_train2_t = pandas.DataFrame(X_train2) #, columns = X_columns)
    X_test_t = pandas.DataFrame(X_test)# , columns = X_columns)

    for estimator_conf in regressions:
        input_name = "with_cc_"

        suffix = "_" + estimator_conf['name'].replace(" ", "_")  + "_"  + scale 

        output_name = input_name + suffix
        with open("test_log_" 
            + output_name
            + ".txt", "w") as test_log:


            print(output_name)
            # print (X_train)
            # print(len(X_train), len(X_train[0]))
            # print(len(X_test),len(X_test[0]), len(y_test))

            a= estimator_conf['instance'].fit(X_train, y_train)

            test_log.write(estimator_conf['name'] + "\n")

            output_name = input_name + "_" + estimator_conf['name'].replace(" ", "_")  
            pickle.dump(a, open(output_name + ".pkl", "wb"))
            
            #classification_visualizer(a, X_test, y_test, output_name)            
            test_log.write("score \t " +str(a.score(X_test, y_test)) + "\n")
            test_log.write("train performance\n")
            cnf_matrix =confusion_matrix(y_train, a.predict(X_train))
            test_log.write( str(cnf_matrix) + "\n" )
            test_log.write(str(
                cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis])+ "\n")
            test_log.write("test performance\n")
        
            cnf_matrix =confusion_matrix(y_test, a.predict(X_test))
            test_log.write(str( cnf_matrix) + "\n")
            test_log.write(str(
                (cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]))+ "\n")    

            


            training_manager.add_values_to_table("run_results"
                , [(output_name 
                    ,start_time # what time it was
                    ,str(a.score(X_test, y_test)) # classification score
                    ,int(cnf_matrix[0][0])
                    ,int(cnf_matrix[0][1])
                    ,int(cnf_matrix[1][0])
                    ,int(cnf_matrix[1][1])
                    )] , conn) 

            np.set_printoptions(precision=2)
            
            # Plot non-normalized confusion matrix
            plt.figure()
            plot_confusion_matrix(cnf_matrix, classes=class_names,
                                  title='Confusion matrix, without normalization')
            plt.savefig("cnf_m_" + output_name  + ".pdf")

            # Plot normalized confusion matrix
            plt.figure()
            plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                                  title='Normalized confusion matrix')
            plt.savefig("cnf_m_normed_" + output_name  + ".pdf")


            # print (len(y_proba(a, X_train)))
            # print (len(X_train))
            # print (type(X_train))
            # X_train["prediction_1"] = pandas.Series(y_proba(a, X_train))
            # X_test["prediction_" + output_name]  = y_proba(a, X_test)            



            X_train2_t[output_name + "_prediction"] = y_proba(a, X_train2) 

            X_test_t[output_name + "_prediction"] = y_proba(a, X_test)                 
            #y_test_dict[output_name] = y_proba(a, X_test) 




    good_cols = ["bookings_pre_first_ride"]

    good_cols = X_columns
    print (len(X_train2[0]))
    X_train2 = clean_df(X_train2_t, good_cols)

    print (len(X_train2.columns))
    X_test = clean_df(X_test_t, good_cols)

    for estimator_conf in regressions:

        input_name = "second_with_email_"

        suffix = "_" + estimator_conf['name'].replace(" ", "_")  + "_"  + scale + "_pca_" +str(n_comp)

        output_name = input_name + suffix
        with open("test_log_" 
            + output_name
            + ".txt", "w") as test_log:

            a= estimator_conf['instance'].fit(X_train2, y_train2)

            test_log.write(estimator_conf['name'] + "\n")

            output_name = input_name + "_" + estimator_conf['name'].replace(" ", "_")  
            pickle.dump(a, open(output_name + ".pkl", "wb"))
            
            classification_visualizer(a, X_test, y_test, output_name)            
            test_log.write("score \t " +str(a.score(X_test, y_test)) + "\n")
            test_log.write("train performance\n")
            cnf_matrix =confusion_matrix(y_train2, a.predict(X_train2))
            test_log.write( str(cnf_matrix) + "\n" )
            test_log.write(str(
                cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis])+ "\n")
            test_log.write("test performance\n")

            cnf_matrix =confusion_matrix(y_test, a.predict(X_test))
            test_log.write(str( cnf_matrix) + "\n")
            test_log.write(str(
                (cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]))+ "\n")    

            training_manager.add_values_to_table("run_results"
                , [(output_name 
                    ,start_time # what time it was
                    ,str(a.score(X_test, y_test)) # classification score
                    ,int(cnf_matrix[0][0])
                    ,int(cnf_matrix[0][1])
                    ,int(cnf_matrix[1][0])
                    ,int(cnf_matrix[1][1])
                    )] , conn) 

            np.set_printoptions(precision=2)
            
            # Plot non-normalized confusion matrix
            plt.figure()
            plot_confusion_matrix(cnf_matrix, classes=class_names,
                                  title='Confusion matrix, without normalization')
            plt.savefig("cnf_m_" + output_name  + ".pdf")

            # Plot normalized confusion matrix
            plt.figure()
            plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                                  title='Normalized confusion matrix')
            plt.savefig("cnf_m_normed_" + output_name  + ".pdf")