Example #1
            rhinoceros.append(col)
    print(df_train.info())
    print(df_train.head())
else:
    zebra = []
    rhinoceros = []
'''
Features combination
'''
if combination:
    print('combination')
    if poly:
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        output_array = poly.fit_transform(df_train.loc[:, 'f28':])
        df_output = pd.DataFrame(output_array,
                                 columns=poly.get_feature_names(
                                     df_train.loc[:, 'f28':].columns))
        print(df_output.info())
        print(df_output.head())
        sys.exit(0)

    donkey = []
    cat_zebra = False
    cat_rhinoceros = False
    date_cat = False
    f5_cat = False
    catD = True

    if cat_zebra:
        for col1 in cat:
            for col2 in zebra:
                df_train[col1 + col2] = df_train[col1] + df_train[col2] * 10
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
X_hour_week_onehot = enc.fit_transform(X_hour_week).toarray()
eval_on_features(X_hour_week_onehot, y, Ridge()) # use ridge - with regularization

# Pre_Processing 2: Polynomial Features
from sklearn.preprocessing import PolynomialFeatures
poly_transformer = PolynomialFeatures(degree=2, interaction_only=True,
                                      include_bias=False)
X_hour_week_onehot_poly = poly_transformer.fit_transform(X_hour_week_onehot)  # if this were normalization, train and test could not be fit together like this
lr = Ridge()
eval_on_features(X_hour_week_onehot_poly, y, lr)

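
# The comment above is about data leakage: statistics used for scaling must be
# learned on the training split only. A minimal sketch of that pattern with a
# scaler (the toy arrays are illustrative assumptions, not the bike data):
import numpy as np
from sklearn.preprocessing import StandardScaler

X_train_demo = np.array([[1.0], [2.0], [3.0]])  # hypothetical training data
X_test_demo = np.array([[4.0], [5.0]])          # hypothetical test data
scaler = StandardScaler().fit(X_train_demo)     # fit the statistics on train only
X_test_scaled = scaler.transform(X_test_demo)   # reuse them on the test set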
## plot coefficients learned by the model (NA for random forest)
hour = ["%02d:00" % i for i in range(0, 24, 3)]
day = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
features = day + hour

# name all the interaction features, and keep only the features with nonzero coefficients:
features_poly = poly_transformer.get_feature_names(features)
features_nonzero = np.array(features_poly)[lr.coef_ != 0]
coef_nonzero = lr.coef_[lr.coef_ != 0]

# visualize the coefficients learned by the linear model
plt.figure(figsize=(15, 2))
plt.plot(coef_nonzero, 'o')
plt.xticks(np.arange(len(coef_nonzero)), features_nonzero, rotation=90)
plt.xlabel("Feature name")
plt.ylabel("Feature magnitude")
plt.show()
    def featureEngineer(self, data, ntrain):
        data.loc[(data.PoolArea > 0), ['MiscFeature']] = 'Pool'
        data.loc[(data.PoolArea > 0),
                 ['MiscVal']] = data.loc[(data.PoolArea > 0),
                                         ['MiscVal', 'PoolArea']].apply(
                                             lambda x:
                                             (x.MiscVal + x.PoolArea),
                                             axis=1)

        data['TotalExtraPoints'] = (data.HeatingQC + data.PoolQC +
                                    data.FireplaceQu + data.KitchenQual)
        data['TotalPoints'] = (
            data.ExterQual + data.FireplaceQu + data.GarageQual +
            data.KitchenQual + data.BsmtQual + data.BsmtExposure +
            data.BsmtFinType1 + data.PoolQC + data.ExterCond + data.BsmtCond +
            data.GarageCond + data.OverallCond + data.BsmtFinType2 +
            data.HeatingQC) + data.OverallQual**2

        df = data.loc[(data.SalePrice > 0), [
            'TotalPoints', 'TotalExtraPoints', 'OverallQual', 'OverallCond',
            'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
            'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'PoolQC',
            'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond',
            'SalePrice'
        ]]

        data['GarageArea_x_Car'] = data.GarageArea * data.GarageCars

        data['TotalBsmtSF_x_Bsm'] = data.TotalBsmtSF * data['1stFlrSF']

        # We don't have a feature with the total constructed area; it may be an interesting feature to create.
        data['ConstructArea'] = (data.TotalBsmtSF + data.WoodDeckSF +
                                 data.GrLivArea + data.OpenPorchSF +
                                 data.TSsnPorch + data.ScreenPorch +
                                 data.EnclosedPorch + data.MasVnrArea +
                                 data.GarageArea + data.PoolArea)

        # all_data['TotalArea'] = all_data.ConstructArea + all_data.LotArea

        data['Garage_Newest'] = data.YearBuilt > data.GarageYrBlt
        data.Garage_Newest = data.Garage_Newest.apply(lambda x: 1 if x else 0)

        data['TotalPorchSF'] = (data.OpenPorchSF + data.EnclosedPorch +
                                data.TSsnPorch + data.ScreenPorch +
                                data.WoodDeckSF)
        data.EnclosedPorch = data.EnclosedPorch.apply(lambda x: 1 if x else 0)

        data['LotAreaMultSlope'] = data.LotArea * data.LandSlope

        data['BsmtSFPoints'] = (data.BsmtQual**2 + data.BsmtCond +
                                data.BsmtExposure + data.BsmtFinType1 +
                                data.BsmtFinType2)

        data['BsmtSFMultPoints'] = data.TotalBsmtSF * (
            data.BsmtQual**2 + data.BsmtCond + data.BsmtExposure +
            data.BsmtFinType1 + data.BsmtFinType2)

        data['TotBathrooms'] = data.FullBath + (
            data.HalfBath * 0.5) + data.BsmtFullBath + (data.BsmtHalfBath *
                                                        0.5)
        data.FullBath = data.FullBath.apply(lambda x: 1 if x else 0)
        data.HalfBath = data.HalfBath.apply(lambda x: 1 if x else 0)
        data.BsmtFullBath = data.BsmtFullBath.apply(lambda x: 1 if x else 0)
        data.BsmtHalfBath = data.BsmtHalfBath.apply(lambda x: 1 if x else 0)

        data.MSSubClass = data.MSSubClass.astype('str')
        data.MoSold = data.MoSold.astype('str')

        data, dummies = self.one_hot_encode(data)

        ZeroTest = data[dummies][ntrain:].sum() == 0
        data.drop(dummies[ZeroTest], axis=1, inplace=True)
        print('Dummies in the test dataset with all observations equal to 0:',
              len(dummies[ZeroTest]), 'of \n', dummies[ZeroTest], '\n')
        dummies = dummies.drop(dummies[ZeroTest])

        # Find dummies where all training observations are equal to 0
        ZeroTest = data[dummies][:ntrain].sum() == 0
        data.drop(dummies[ZeroTest], axis=1, inplace=True)
        print('Dummies in the training dataset with all observations equal to 0:',
              len(dummies[ZeroTest]), 'of \n', dummies[ZeroTest], '\n')
        dummies = dummies.drop(dummies[ZeroTest])

        del ZeroTest

        data['Remod'] = 2
        data.loc[(data.YearBuilt == data.YearRemodAdd), ['Remod']] = 0
        data.loc[(data.YearBuilt != data.YearRemodAdd), ['Remod']] = 1

        #all_data['Age'] = all_data.YearRemodAdd - all_data.YrSold  # since I convert both to age

        data["WasNew"] = 2
        data.loc[(data.YearBuilt == data.YrSold), ['WasNew']] = 1
        data.loc[(data.YearBuilt != data.YrSold), ['WasNew']] = 0

        data.drop([
            'FireplaceQu', 'BsmtSFPoints', 'TotalBsmtSF', 'GarageArea',
            'GarageCars', 'OverallQual', 'GrLivArea', 'TotalBsmtSF_x_Bsm',
            '1stFlrSF', 'PoolArea', 'LotArea', 'SaleCondition_Partial',
            'Exterior1st_VinylSd', 'GarageCond', 'HouseStyle_2Story',
            'BsmtSFMultPoints', 'ScreenPorch', 'LowQualFinSF', 'BsmtFinSF2',
            'TSsnPorch'
        ],
                  axis=1,
                  inplace=True)

        data.rename(columns={'2ndFlrSF': 'SndFlrSF'}, inplace=True)

        # Remove the highest correlations and run a multiple regression
        cols = data.columns
        print(cols)
        cols = cols.drop(['SalePrice'])
        #vif = self.VRF('SalePrice', all_data.loc[all_data.SalePrice > 0, cols], all_data.SalePrice[all_data.SalePrice > 0], cols)
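
        # The commented-out self.VRF call above checks multicollinearity via the
        # variance inflation factor (VIF). A minimal, hypothetical sketch of that
        # check with statsmodels (kept commented, like the original call, so the
        # method's behaviour is unchanged):
        # from statsmodels.stats.outliers_influence import variance_inflation_factor
        # X_vif = data.loc[data.SalePrice > 0, cols].astype(float)
        # vif_series = pd.Series(
        #     [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])],
        #     index=cols)
        # print(vif_series.sort_values(ascending=False).head())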

        cols = cols.drop([
            'Condition1_PosN', 'Neighborhood_NWAmes', 'Exterior1st_CBlock',
            'BldgType_1Fam', 'RoofStyle_Flat', 'MSZoning_Call', 'Alley_Grvl',
            'LandContour_Bnk', 'LotConfig_Corner', 'GarageType_2Types',
            'MSSubClass_45', 'MasVnrType_BrkCmn', 'Foundation_CBlock',
            'MiscFeature_Gar2', 'SaleType_COD', 'Exterior2nd_CBlock'
        ])

        #vif = self.VRF('SalePrice', all_data.loc[all_data.SalePrice > 0, cols], all_data.SalePrice[all_data.SalePrice > 0], cols)

        cols = cols.drop([
            'PoolQC', 'BldgType_TwnhsE', 'BsmtFinSF1', 'BsmtUnfSF',
            'Electrical_SBrkr', 'Exterior1st_MetalSd', 'Exterior2nd_VinylSd',
            'GarageQual', 'GarageType_Attchd', 'HouseStyle_1Story',
            'MasVnrType_None', 'MiscFeature_NA', 'MSZoning_RL',
            'RoofStyle_Gable', 'SaleCondition_Normal', 'MoSold_10',
            'SaleType_New', 'SndFlrSF', 'TotalPorchSF', 'WoodDeckSF',
            'BldgType_Duplex', 'MSSubClass_90'
        ])

        print(cols)
        #print(vif)

        df_copy = data[data.SalePrice > 0].copy()

        data.CentralAir = data.CentralAir.astype('uint8')
        data.Garage_Newest = data.Garage_Newest.astype('uint8')
        data.EnclosedPorch = data.EnclosedPorch.astype('uint8')
        data.FullBath = data.FullBath.astype('uint8')
        data.HalfBath = data.HalfBath.astype('uint8')
        data.BsmtFullBath = data.BsmtFullBath.astype('uint8')
        data.BsmtHalfBath = data.BsmtHalfBath.astype('uint8')
        data.Remod = data.Remod.astype('uint8')
        data.WasNew = data.WasNew.astype('uint8')
        data.Street = data.Street.astype('uint8')  # ordinal
        data.PavedDrive = data.PavedDrive.astype('uint8')  # ordinal
        data.Functional = data.Functional.astype('uint8')  # ordinal
        data.LandSlope = data.LandSlope.astype('uint8')  # ordinal

        sub_dtypes = data.loc[:, cols].dtypes
        numeric_features = list(
            sub_dtypes[(sub_dtypes != "category")
                       & (sub_dtypes != 'uint8')].index)
        '''
		with warnings.catch_warnings():
		    warnings.simplefilter("ignore", category=RuntimeWarning)
		'''
        skewed_features = data[numeric_features].apply(
            lambda x: skew(x.dropna())).sort_values(ascending=False)

        #compute skewness
        skewness = pd.DataFrame({'Skew': skewed_features})

        # Keep only the highest-skewed features
        skewness = skewness[abs(skewness) > 0.7]
        skewness = skewness.dropna()

        l_opt = {}

        for feat in skewness.index:
            data[feat], l_opt[feat] = boxcox((data[feat] + 1))

        skewed_features2 = data[skewness.index].apply(
            lambda x: skew(x.dropna())).sort_values(ascending=False)

        #compute skewness
        skewness2 = pd.DataFrame({'New Skew': skewed_features2})

        y = data.SalePrice[data.SalePrice > 0]
        X = data.loc[data.SalePrice > 0, ['ConstructArea']]
        #self.poly(X, y, 'ConstructArea')

        X = data.loc[data.SalePrice > 0, ['ConstructArea', 'TotalPoints']]
        #self.poly(X, y)

        X = data.loc[data.SalePrice > 0, [
            'ConstructArea', 'TotalPoints', 'LotAreaMultSlope',
            'GarageArea_x_Car'
        ]]
        #self.poly(X, y)

        poly_cols = [
            'ConstructArea', 'TotalPoints', 'LotAreaMultSlope',
            'GarageArea_x_Car'
        ]

        pf = PolynomialFeatures(degree=3,
                                interaction_only=False,
                                include_bias=False)
        res = pf.fit_transform(data.loc[:, poly_cols])

        target_feature_names = [
            feat.replace(' ', '_') for feat in pf.get_feature_names(poly_cols)
        ]
        output_df = pd.DataFrame(res,
                                 columns=target_feature_names,
                                 index=data.index).iloc[:, len(poly_cols):]
        print('Polynomial Features included:', output_df.shape[1])
        # display(output_df.head())
        data = pd.concat([data, output_df], axis=1)
        print('Total Features after Polynomial Features included:',
              data.shape[1])
        colsP = output_df.columns

        del output_df, target_feature_names, res, pf

        y_train = (data.SalePrice[data.SalePrice > 0].reset_index(
            drop=True, inplace=False))
        #self.trainingData = all_data.loc[(all_data.SalePrice>0), cols].reset_index(drop=True, inplace=False)
        #self.testingData = all_data.loc[(all_data.SalePrice==0), cols].reset_index(drop=True, inplace=False)

        return data, y_train, cols, colsP
x = pd.DataFrame(np.c_[df['LSTAT'], df['RM']], columns=['LSTAT', 'RM'])
Y = df['MEDV']

from sklearn.model_selection import train_test_split
x_train, x_test, Y_train, Y_test = train_test_split(x,
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=5)

#---use a polynomial function of degree 2---
degree = 2  # degree-2 polynomial
polynomial_features = PolynomialFeatures(degree=degree)
x_train_poly = polynomial_features.fit_transform(x_train)

#---print out the formula---
print(polynomial_features.get_feature_names(['x', 'y']))

model = LinearRegression()
model.fit(x_train_poly, Y_train)

x_test_poly = polynomial_features.transform(x_test)  # reuse the transformer fitted on the training set
print('R-Squared: %.4f' % model.score(x_test_poly, Y_test))

print(model.intercept_)
print(model.coef_)

fig = plt.figure(figsize=(18, 15))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x['LSTAT'], x['RM'], Y, c='b')

ax.set_xlabel("LSTAT")
# impute missing values
imputer = SimpleImputer(strategy='median')
poly_target = poly_features['TARGET']
poly_features = poly_features.drop(columns=['TARGET'])
poly_features = imputer.fit_transform(poly_features)
poly_features_test = imputer.transform(poly_features_test)
# create polynomial features
poly_transformer = PolynomialFeatures(degree=3)
poly_transformer.fit(poly_features)
# transform the features
poly_features = poly_transformer.transform(poly_features)
poly_features_test = poly_transformer.transform(poly_features_test)
print('Polynomial Features shape: ', poly_features.shape)

poly_transformer.get_feature_names(input_features=[
    'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'
])[:15]

# create df for features
poly_features = pd.DataFrame(poly_features,
                             columns=poly_transformer.get_feature_names([
                                 'EXT_SOURCE_1', 'EXT_SOURCE_2',
                                 'EXT_SOURCE_3', 'DAYS_BIRTH'
                             ]))
# add in the target
poly_features['TARGET'] = poly_target
# find the correlations with the target
poly_corrs = poly_features.corr()['TARGET'].sort_values()
# display most negative and most positive
print(poly_corrs.head(10))
print(poly_corrs.tail(5))
Example #6
# ### 1.b Extract polynomial features and interactions up to a degree of 2

X = data.drop('MEDV', axis=1)
Y = data['MEDV']

# ##### By default train_test_split assigns 25% of the sample to the test set
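
# A minimal sketch of that default 75/25 split using the X and Y defined above
# (random_state here is an arbitrary illustrative choice):
from sklearn.model_selection import train_test_split
X_train_demo, X_test_demo, Y_train_demo, Y_test_demo = train_test_split(X, Y, random_state=0)
print(X_train_demo.shape, X_test_demo.shape)  # roughly a 3:1 row split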

from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, include_bias=False).fit(X)
X_poly = poly.transform(X)
print("X_poly.shape: {}".format(X_poly.shape))

# ##### The polynomial transformation dataset includes 104 features: the 13 original features, their 13 squared values, and 78 interactions among them. Including these new variables should improve model fit over a linear version, which does not account for non-linear relations between the features (explanatory variables) and the dependent variable (average price of Boston houses, MEDV)

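# A quick check of that count: with d = 13 inputs, degree 2, and include_bias=False,
# PolynomialFeatures yields d linear terms + d squares + C(d, 2) pairwise interactions.
from math import comb
d = 13
print(d + d + comb(d, 2))  # 104, matching X_poly.shape[1]
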
print("Polynomial feature names:\n{}".format(poly.get_feature_names()))

# ##### The printed feature names confirm the same breakdown: the 13 original terms (x0 .. x12), their 13 squares, and the 78 pairwise interaction terms

# ### 1.c Create a pandas DataFrame using the polynomials and save the file

polynomials0 = pd.DataFrame(
    X_poly,
    columns=[
        'x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10',
        'x11', 'x12', 'x0^2', 'x0 x1', 'x0 x2', 'x0 x3', 'x0 x4', 'x0 x5',
        'x0 x6', 'x0 x7', 'x0 x8', 'x0 x9', 'x0 x10', 'x0 x11', 'x0 x12',
        'x1^2', 'x1 x2', 'x1 x3', 'x1 x4', 'x1 x5', 'x1 x6', 'x1 x7', 'x1 x8',
        'x1 x9', 'x1 x10', 'x1 x11', 'x1 x12', 'x2^2', 'x2 x3', 'x2 x4',
        'x2 x5', 'x2 x6', 'x2 x7', 'x2 x8', 'x2 x9', 'x2 x10', 'x2 x11',
        'x2 x12', 'x3^2', 'x3 x4', 'x3 x5', 'x3 x6', 'x3 x7', 'x3 x8', 'x3 x9',
# [3*x for x in range(max_range)],
# [4*x for x in range(max_range)],
# [5*x for x in range(max_range)]]

X = la.transpose(X)  # print(X)

# Fitting Polynomial Regression to the dataset
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree=2)

X_poly_features = poly_reg.fit(X)
# print(X_poly_features)
# print()

# fit or fit_transform must be called before this is called
feature_names = poly_reg.get_feature_names()
feature_names.sort()
print(feature_names)
print()

# X_poly_transform = poly_reg.transform(X)
# print(len(poly_reg.get_feature_names()))
# print(X_poly_transform)
# print()

# print(poly_reg.get_feature_names())
# print()

# print(poly_reg.get_params())
# print()
Example #8
def feature_adder_poly(df, *cols, degree=2, include_bias=False):
    # Fit PolynomialFeatures on the selected columns and return the expansion
    # with the first len(cols) output columns (the untouched originals, given
    # the default include_bias=False) sliced off, plus the matching names.
    poly = PolynomialFeatures(degree=degree, include_bias=include_bias)
    poly.fit(df[list(cols)])
    return (poly.transform(df[list(cols)])[:, len(cols):],
            poly.get_feature_names(cols)[len(cols):])
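
# A hypothetical usage sketch of the helper above (toy frame, for illustration only):
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures

demo_df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
demo_vals, demo_names = feature_adder_poly(demo_df, 'a', 'b')
print(demo_names)  # ['a^2', 'a b', 'b^2'] -- only the new columns are returned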
        df_label = df_label[(z < threshold)]

        # Reset the index for the polynomial features merge
        df_label = df_label.reset_index(drop=True)

        # Get polynomial features
        polyTrans = PolynomialFeatures(degree=2, include_bias=False)
        df_label_Num = df_label[[
            "level", "temperature", "usage", "Brightness", "RAM"
        ]]
        df_label = df_label.drop(
            ["level", "temperature", "usage", "Brightness", "RAM"],
            axis=1)  # Drop them to get back later the poly Trans of them

        polyData_Num = polyTrans.fit_transform(df_label_Num)
        columnNames = polyTrans.get_feature_names(
            ["level", "temperature", "usage", "Brightness", "RAM"])
        df_label_Num = pandas.DataFrame(polyData_Num, columns=columnNames)

        for column in columnNames:
            df_label[column] = pandas.Series(df_label_Num[column])

        # Get dataframes
        y_label = df_label["output"]
        X_label = df_label.drop(["output"], axis=1)

        # Keep only the selected columns for each labels
        X_label = X_label[selColumns[idx]]

        # Split data training and testing ...
        X_train_label, X_test_label, y_train_label, y_test_label = train_test_split(
            X_label, y_label, test_size=0.25, random_state=42)
Example #10
def featureSelection(data, labels):
    plotROCCurveBase()

    crossValidationTestAndPlot(LogisticRegression(),
                               "Full Feature Set",
                               data,
                               labels,
                               cvNum=5,
                               addAverage=True)

    # Create and fit selector
    selector = SelectKBest(k=100)
    selector.fit(data, labels)
    # Get columns to keep
    cols = selector.get_support()
    print(len(cols))
    # Create new dataframe with only desired columns, or overwrite existing
    data = data[data.columns[cols]]
    print(data.shape)

    crossValidationTestAndPlot(LogisticRegression(),
                               "100-Best Features",
                               data,
                               labels,
                               cvNum=5,
                               addAverage=True)

    print(data.columns.values.tolist())

    poly = PolynomialFeatures(interaction_only=True)
    polyData = pd.DataFrame(poly.fit_transform(data),
                            columns=poly.get_feature_names(data.columns))

    crossValidationTestAndPlot(LogisticRegression(),
                               "Features with Interaction",
                               polyData,
                               labels,
                               cvNum=5,
                               addAverage=True)

    #print ("Interaction Features", poly.get_feature_names(data.columns))

    # Create and fit selector
    selector = SelectKBest(k=100)
    selector.fit(polyData, labels)
    # Get columns to keep
    cols = selector.get_support()
    # Create new dataframe with only desired columns, or overwrite existing
    polyData = polyData[polyData.columns[cols]]

    crossValidationTestAndPlot(LogisticRegression(),
                               "100-Best Features With Interaction",
                               polyData,
                               labels,
                               cvNum=5,
                               addAverage=True)
    print(polyData.columns.values.tolist())
    #print(polyData.get_feature_names(data.columns))

    poly3 = PolynomialFeatures(degree=2)
    poly3Data = pd.DataFrame(poly3.fit_transform(data),
                             columns=poly3.get_feature_names(data.columns))
    crossValidationTestAndPlot(LogisticRegression(),
                               "Features with Polynomials up to n^2",
                               poly3Data,
                               labels,
                               cvNum=5,
                               addAverage=True)

    # Create and fit selector
    selector = SelectKBest(k=100)
    selector.fit(poly3Data, labels)
    # Get columns to keep
    cols = selector.get_support()
    # Create new dataframe with only desired columns, or overwrite existing
    poly3Data = poly3Data[poly3Data.columns[cols]]

    crossValidationTestAndPlot(LogisticRegression(),
                               "100-Best Features With Polynomials up to n^2",
                               poly3Data,
                               labels,
                               cvNum=5,
                               addAverage=True)

    plt.legend()
    plt.show()
Example #11
        train_performances = train_scenario.performance_data
        print(train_performances)
        train_features = train_scenario.feature_data
        # preprocessing
        imputer = SimpleImputer()
        polytransform = PolynomialFeatures(2)
        scaler = StandardScaler()

        # Impute
        train_features[train_features.columns] = imputer.fit_transform(
            train_features[train_features.columns])

        # Create polynomial features
        if use_quadratic_transform:
            quad_data = polytransform.fit_transform(train_features.to_numpy())
            new_cols = polytransform.get_feature_names(train_features.columns)
            train_features = pd.DataFrame(data=quad_data,
                                          index=train_features.index,
                                          columns=new_cols)

        # Standardize
        train_features[train_features.columns] = scaler.fit_transform(
            train_features[train_features.columns])

        # inst, perf, rank = util.construct_numpy_representation_with_pairs_of_rankings(
        #     train_features, train_performances, max_pairs_per_instance=max_pairs_per_instance, seed=seed)

        cutoff = scenario.algorithm_cutoff_time
        par10 = cutoff * 10

        perf = train_performances.to_numpy()
Example #12
    def test_heat_capacity(self):
        # Heat-capacity data tabulated against CMU size, % solid, and
        # density of the concrete in the CMU (lb/ft³).

        # The 6.9 value in row 5 is bad data, but it is kept here so the table
        # matches the original source; it is eliminated later by dropping that row.
        hc_data = {
            "heat_capacity_IMP": [
                3.40, 3.78, 4.17, 4.55, 4.93, 5.56, 5.96, 4.01, 4.47, 4.94,
                5.40, 5.86, 6.60, 7.08, 5.05, 5.64, 6.23, 6.82, 7.41, 8.37,
                8.99, 4.36, 4.87, 5.37, 5.87, 6.38, 7.19, 7.72, 6.04, 6.76,
                7.47, 8.18, 6.90, 10.05, 10.80, 5.57, 6.23, 6.88, 7.52, 8.17,
                9.21, 9.89, 8.17, 9.14, 10.11, 11.08, 12.04, 13.61, 14.63,
                6.50, 7.25, 8.01, 8.76, 9.51, 10.60, 11.38, 10.26, 11.48,
                12.71, 13.93, 15.15, 17.13, 18.41, 7.75, 8.66, 9.57, 10.48,
                11.39, 12.86, 13.81, 12.30, 13.77, 15.25, 16.37, 18.20, 20.59,
                22.14
            ],
            "percent_solid": [
                65, 65, 65, 65, 65, 65, 65, 78, 78, 78, 78, 78, 78, 78, 100,
                100, 100, 100, 100, 100, 100, 55, 55, 55, 55, 55, 55, 55, 78,
                78, 78, 78, 78, 78, 78, 52, 52, 52, 52, 52, 52, 52, 78, 78, 78,
                78, 78, 78, 78, 48, 48, 48, 48, 48, 48, 48, 78, 78, 78, 78, 78,
                78, 78, 48, 48, 48, 48, 48, 48, 48, 78, 78, 78, 78, 78, 78, 78
            ],
            "thickness_in": [
                4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
                6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8,
                8, 8, 8, 8, 8, 8, 8, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
                10, 10, 10, 10, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
                12, 12
            ],
            "density_IMP": [
                80, 90, 100, 110, 120, 130, 140, 80, 90, 100, 110, 120, 130,
                140, 80, 90, 100, 110, 120, 130, 140, 80, 90, 100, 110, 120,
                130, 140, 80, 90, 100, 110, 120, 130, 140, 80, 90, 100, 110,
                120, 130, 140, 80, 90, 100, 110, 120, 130, 140, 80, 90, 100,
                110, 120, 130, 140, 80, 90, 100, 110, 120, 130, 140, 80, 90,
                100, 110, 120, 130, 140, 80, 90, 100, 110, 120, 130, 140
            ]
        }

        df_IMP = pd.DataFrame(hc_data)
        df = pd.DataFrame({
            "heat capacity":
            self.Btu_per_ft2F_to_J_per_m2K * df_IMP["heat_capacity_IMP"],
            "percent solid":
            df_IMP["percent_solid"],
            "thickness":
            self.in_to_m * df_IMP["thickness_in"],
            "density":
            self.lbpft3_to_kgpm3 * df_IMP["density_IMP"]
        })
        if self.include_plots:
            fig, axl = plt.subplots(3, 1, figsize=(10, 20))
            axl[0].scatter(df["percent solid"],
                           df["heat capacity"],
                           color="red")
            axl[1].scatter(df["thickness"], df["heat capacity"], color="blue")
            axl[2].scatter(df["density"], df["heat capacity"], color="green")

        X = df[["percent solid", "thickness", "density"]]
        Y = df[["heat capacity"]]

        model = sm.OLS(Y, X).fit()
        predictions = model.predict(X)

        print_model = model.summary()

        # from multi-variate linear to multi-variate polynomial fits.
        if self.include_plots:
            fig2, axl = plt.subplots(1, 2, figsize=(20, 10))
        for i in range(5):
            poly = PolynomialFeatures(degree=i + 1)
            X_ = poly.fit_transform(X, y=Y)
            model = sm.OLS(Y, X_).fit()
            predictions = model.predict(X_)
            df['heat capacity fit polynomial order {0:2d}'.format(
                i + 1)] = predictions
            df['heat capacity errors polynomial order {0:2d}'.format(
                i + 1)] = 100 * (predictions -
                                 df["heat capacity"]) / df["heat capacity"]
            if self.include_plots:
                df['heat capacity errors polynomial order {0:2d}'.format(
                    i + 1)].plot(ax=axl[0], label="poly {0:2d}".format(i + 1))
        if self.include_plots:
            axl[0].legend()
            axl[0].set_title("Fits with 1 bad data point")

        # The np.nan used to be 6.9, but that value makes no sense with respect to
        # the data; the polynomial fits and associated errors confirmed this by
        # being unable to fit the spurious point.
        df.drop(index=32, inplace=True)
        # redo X and Y because we have dropped a point.
        X = df[["percent solid", "thickness", "density"]]
        Y = df[["heat capacity"]]
        if self.include_plots:
            fig3, ax3 = plt.subplots(1, 1)
        # we now stop at order 3 because it has the best combination of error and complexity:
        #    poly  2 error max: 8.279 min: -8.256 average magnitude: 1.888
        #    poly  3 error max: 2.006 min: -1.714 average magnitude: 0.731 X
        #    poly  4 error max: 2.044 min: -1.665 average magnitude: 0.655
        #    poly  5 error max: 1.958 min: -1.561 average magnitude: 0.635
        for i in range(3):
            poly = PolynomialFeatures(degree=i + 1)
            X_ = poly.fit_transform(X, y=Y)
            model = sm.OLS(Y, X_).fit()
            predictions = model.predict(X_)
            df['heat capacity fit polynomial order {0:2d}'.format(
                i + 1)] = predictions
            err = 100 * (predictions -
                         df["heat capacity"]) / df["heat capacity"]
            df['heat capacity errors polynomial order {0:2d}'.format(i +
                                                                     1)] = err
            if self.include_plots:
                df['heat capacity errors polynomial order {0:2d}'.format(
                    i + 1)].plot(ax=axl[1], label="poly {0:2d}".format(i + 1))
                print(
                    "poly {0:2d} error max: {1:5.3f} min: {2:5.3f} average magnitude: {3:5.3f}"
                    .format(i + 1, np.max(np.max(err.values)),
                            np.min(err.values), np.mean(abs(err.values))))

        if self.include_plots:
            axl[1].legend()
            axl[1].set_title("Fits bad data removed")
            model.summary()
            print("The model parameters are:")
            params = model.params
            for param in params:
                print("{0:12.8e}".format(param))

            print(poly.get_feature_names())

        # now test the implementation in ElCanoBuildingEnergy_Demand_Load_Model.py.concrete_wall
        HC_val1 = 19.312 / self.Btu_per_hft2F_to_W_per_m2K * self.Btu_per_ft2F_to_J_per_m2K
        HC_val2 = 42.7136 / self.Btu_per_hft2F_to_W_per_m2K * self.Btu_per_ft2F_to_J_per_m2K
        wall1 = ec_be.concrete_wall(1280, 0.35, 0.1016, 0.0)
        wall2 = ec_be.concrete_wall(1760, (100 - 52) / 100, 0.2032, 0.0)

        err1 = np.abs(100 * (wall1.HC_value - HC_val1) / HC_val1)
        err2 = np.abs(100 * (wall2.HC_value - HC_val2) / HC_val2)

        self.assertTrue(err1 < 2.006)
        self.assertTrue(err2 < 2.006)
def task_2b():
    # read csv files as dataframe
    life_df = pd.read_csv("life.csv")
    world_df = pd.read_csv("world.csv")
    world_df = world_df.rename(columns={
        'Country Name': 'Country',
        'Time': 'Year'
    })

    # PREPROCESSING - from Question 2A
    # merge dataframes on common columns (country and country code)
    world_df = world_df.rename(columns={
        'Country Name': 'Country',
        'Time': 'Year'
    })
    new_df = pd.merge(life_df,
                      world_df,
                      how='inner',
                      on=['Country', 'Country Code'])
    # remove rows with null 'life expectancy' values
    new_df = new_df.dropna(axis=0, subset=['Life expectancy at birth (years)'])
    # split into training and test sets with random state of 100
    X1 = new_df.iloc[:, 5:]  # learn from these data
    X2 = new_df.iloc[:, 1]
    # only keep Country Code with 20 original features, create a pointer to this df for reference to country split later
    X = pd.concat([X2.reset_index(drop=True),
                   X1.reset_index(drop=True)],
                  axis=1)
    y = new_df.loc[:, 'Life expectancy at birth (years)']  # expected results
    X_train_with_country, X_test_with_country, y_train, y_test = ms.train_test_split(
        X, y, train_size=2 / 3, test_size=1 / 3, random_state=100)
    X_train_with_country_df = pd.DataFrame(
        X_train_with_country,
        index=X_train_with_country.index,
        columns=X_train_with_country.columns)
    X_test_with_country_df = pd.DataFrame(X_test_with_country,
                                          index=X_test_with_country.index,
                                          columns=X_test_with_country.columns)
    # reassign pointers to purely quantitative features in X_train and X_test
    X_train = X_train_with_country.iloc[:, 1:]
    X_test = X_test_with_country.iloc[:, 1:]

    # turn strings from X_train and X_test to NaN (inputs)
    for column in X_train.columns:
        X_train[column] = pd.to_numeric(X_train[column], errors='coerce')
    for column in X_test.columns:
        X_test[column] = pd.to_numeric(X_test[column], errors='coerce')
    # fill the NaN values in X_test and X_train with median of X_train
    for col in X_train.select_dtypes(include=np.number):
        X_train[col] = X_train[col].fillna(X_train[col].median())
    for col in X_test.select_dtypes(include=np.number):
        X_test[col] = X_test[col].fillna(X_train[col].median())
    # scale training set and test set
    scaler = preprocessing.StandardScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)
    scaled_X_train = pd.DataFrame(scaled_X_train,
                                  index=X_train.index,
                                  columns=X_train.columns)
    scaled_X_test = pd.DataFrame(scaled_X_test,
                                 index=X_test.index,
                                 columns=X_test.columns)

    # PART 1: FEATURE ENGINEERING

    # INTERACTION TERM PAIRS
    print(DIVIDER + "INTERACTION TERM PAIRS" + DIVIDER)
    # degree of terms = 2
    # interaction_only=True means no feature is multiplied by itself (no x^2), only cross terms x*y
    # include_bias would add a constant term acting as the intercept in a linear model --> set to False
    poly = PolynomialFeatures(degree=2,
                              interaction_only=True,
                              include_bias=False)
    # numpy array of world.csv with 210 features
    world_int_term_pairs = poly.fit_transform(scaled_X_train)
    int_pair_list = poly.get_feature_names(scaled_X_train.columns.tolist())
    int_term_pairs = pd.DataFrame({'features': int_pair_list})
    world_int_term_pairs_df = pd.DataFrame(data=world_int_term_pairs[:],
                                           columns=int_pair_list)
    print("Number of interaction term pairs:", len(int_term_pairs))
    print(int_term_pairs)
    print("\n")
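
    # A tiny illustration of the two flags above (toy input, purely illustrative,
    # not part of the assignment data): with interaction_only=True and
    # include_bias=False, two inputs [a, b] expand to [a, b, a*b] only.
    demo = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    print(demo.fit_transform([[2, 3]]))        # [[2. 3. 6.]]
    print(demo.get_feature_names(['a', 'b']))  # ['a', 'b', 'a b']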

    # CLUSTERING LABELS
    print(DIVIDER + "CLUSTERING LABELS" + DIVIDER)
    # use elbow method to determine suitable k value
    # within-cluster-sum-of-squares (WCSS)
    wcss = []
    # this loop will fit the k-means algorithm to data and compute the WCSS and append to list
    for i in range(1, 20):
        kmeans = KMeans(n_clusters=i, init='k-means++')
        kmeans.fit(scaled_X_train)
        # kmeans inertia attribute is sum of squared distance of samples to their closest cluster center
        wcss.append(kmeans.inertia_)

    # plot the elbow graph - the curve plateaus around k=5-7, so a value in that range is chosen below
    plt.figure(figsize=(12, 6))
    plt.plot(range(1, 20), wcss, marker='o')
    plt.title('Elbow Method Graph for Selection of Appropriate k Value')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
    plt.savefig('task2bElbowGraph.png', bbox_inches='tight')
    plt.show()
    plt.close()

    # form clusters
    kmeans = KMeans(
        n_clusters=7
    )  # any number between 5-7 should be good, but higher = not much difference
    clusters = kmeans.fit(scaled_X_train)
    prediction = kmeans.predict(scaled_X_train)

    # see count of data points in each cluster; store the labels on the scaled
    # training frame so they can be plotted and dropped again later
    scaled_X_train['k-means cluster'] = prediction
    frame = scaled_X_train
    counts_series = frame['k-means cluster'].value_counts().sort_index()
    counts_df = pd.DataFrame({
        'cluster number': counts_series.index,
        'counts': counts_series.values
    })
    print("\nCount of countries in each cluster:")
    print(counts_df.to_string(index=False))
    # see which cluster each country from the training set is in (a number between 0-6, because there are 7 clusters)
    print("\nCountries from training set with new feature (f-clusterlabels):")
    cluster_label_df = pd.DataFrame({
        'Country Code':
        X_train_with_country_df.iloc[:, 0],
        'f-clusterlabel':
        clusters.labels_
    })
    print("\n", cluster_label_df.to_string(index=False))

    # Turn the 20 features from world.csv into 2 dimensions with PCA
    pca_2 = PCA(n_components=2)
    plot_columns = pca_2.fit_transform(
        scaled_X_train.drop('k-means cluster', axis=1))
    # Plot each cluster and shade by their cluster label
    plt.scatter(x=plot_columns[:, 0],
                y=plot_columns[:, 1],
                c=scaled_X_train["k-means cluster"])
    plt.title('K-Means Clustering with 7 Clusters and 2 Principal Components')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.savefig('task2bKMeansClustering.png', bbox_inches='tight')
    plt.show()
    plt.close()

    # PART 2: FEATURE SELECTION

    # SELECT 4 FEATURES IN A PRINCIPLED MANNER
    print("\n" + DIVIDER + "SELECTING 4 FEATURES IN A PRINCIPLED MANNER" +
          DIVIDER)
    model = ExtraTreesClassifier(random_state=200)
    model.fit(scaled_X_train, y_train)
    #plot graph of top 10 most important features for better visualization
    feat_importances = pd.Series(model.feature_importances_,
                                 index=scaled_X_train.columns)
    feat_importances.nlargest(10).plot(kind='barh')
    plt.title('Top 10 Most Important Features to Life Expectancy')
    plt.xlabel('Importance Score with Extra-Trees Classifier')
    plt.ylabel('Feature Name')
    plt.savefig('task2bTop10ImportantFeatures.png', bbox_inches='tight')
    plt.show()
    plt.close()

    print(
        "The 4 chosen features and their relative importance (descending order) to predicting Life Expectancy are:"
    )
    feat_importance_dict = feat_importances.nlargest(4).to_dict()
    # sort the dictionary in descending order of importance
    feat_importance_dict = {
        key: value
        for key, value in sorted(feat_importance_dict.items(),
                                 key=lambda item: item[1],
                                 reverse=True)
    }
    dct_for_knn = {}
    dct_for_knn_test = {}
    i = 1
    for key, item in feat_importance_dict.items():
        dct_for_knn[key] = scaled_X_train[key]
        dct_for_knn_test[key] = scaled_X_test[key]
        print("Chosen feature number " + str(i) + ": " + key +
              " with an importance of " + str(round(item, 3)))
        i += 1
    principled_df = pd.DataFrame(dct_for_knn)
    principled_df_test = pd.DataFrame(dct_for_knn_test)

    # PCA
    print("\n" + DIVIDER + "PRINCIPAL COMPONENT ANALYSIS" + DIVIDER)
    pca = PCA(n_components=4)
    scaled_X_train = scaled_X_train.drop('k-means cluster', axis=1)
    pca.fit(scaled_X_train)
    principal_components = pca.transform(scaled_X_train)
    principal_df = pd.DataFrame(data=principal_components,
                                columns=['PC-1', 'PC-2', 'PC-3', 'PC-4'])
    principal_components_test = pca.transform(scaled_X_test)
    principal_test = pd.DataFrame(data=principal_components_test,
                                  columns=['PC-1', 'PC-2', 'PC-3', 'PC-4'])
    print("Reduced training set data from " +
          str(len(scaled_X_train.columns)) + " features to " +
          str(len(principal_df.columns)) + " Principal Components:")
    print(principal_df)

    # FIRST 4 FEATURES
    print("\n" + DIVIDER + "FIRST 4 FEATURES" + DIVIDER)
    first_4_features = scaled_X_train.iloc[:, :4]
    first_4_features_test = scaled_X_test.iloc[:, :4]
    print("The first 4 features of the original dataset are:")
    i = 1
    for column in first_4_features.columns:
        print(str(i) + "." + " " + column)
        i += 1

    # PART 3: PERFORM 5-NN CLASSIFICATION USING FEATURES SELECTED ABOVE
    print(
        "\n==========ACCURACY OF EACH FEATURE GROUP IN 5-NN CLASSIFICATION=========="
    )
    k5_classifier = KNeighborsClassifier(n_neighbors=5)

    # 5-NN with 4 features selected in principled manner
    k5_classifier.fit(principled_df, y_train)
    k5_test_accu_principled = k5_classifier.score(principled_df_test, y_test)

    # 5-NN with 4 features from PCA
    k5_classifier.fit(principal_df, y_train)
    k5_test_accu_PCA = k5_classifier.score(principal_test, y_test)

    # 5-NN with first 4 features
    k5_classifier.fit(first_4_features, y_train)
    k5_test_accu_first_4 = k5_classifier.score(first_4_features_test, y_test)
    print('Accuracy of feature engineering: {:.{width}f}'.format(
        k5_test_accu_principled, width=3))
    print('Accuracy of PCA: {:.{width}f}'.format(k5_test_accu_PCA, width=3))
    print('Accuracy of first four features: {:.{width}f}'.format(
        k5_test_accu_first_4, width=3))
Example #14
# ================== Fit a polynomial regression model =========================

# Load the library required for feature engineering
from sklearn.preprocessing import PolynomialFeatures

# Extract the predictor from the dataframe df
X = df.iloc[:, 0:1].values

# Calculate the MSE with a polynomial with varying degrees
degrees = [2, 3, 4, 5, 6, 7, 8, 9]
mse = []
for degree in degrees:
    poly = PolynomialFeatures(degree, include_bias=False)
    X_poly = poly.fit_transform(X)
    X_poly_feature_name = poly.get_feature_names(
        ['Feature' + str(i) for i in range(1, X.shape[1] + 1)])
    df_poly = pd.DataFrame(X_poly, columns=X_poly_feature_name)
    df_poly['y'] = df['Y']
    X_train = df_poly.drop('y', axis=1)
    y_train = df_poly['y']
    linreg = LinearRegression(normalize=True)
    model_poly = linreg.fit(X_train, y_train)
    y_poly = linreg.predict(X_train)
    mse.append(mean_squared_error(y_poly, y_train))

# Analyze the MSE with a polynomial with varying degrees
plt.figure(figsize=(12, 8))
plt.xlabel("Degrees", fontsize=20)
plt.ylabel("Mean-squared Eror", fontsize=20)
plt.grid(1)
plt.scatter(degrees, mse, edgecolors=(0, 0, 0), lw=2, s=80)
Example #15
# Need to impute missing values
poly_features = imputer.fit_transform(poly_features)
poly_features_test = imputer.transform(poly_features_test)

poly_transformer = PolynomialFeatures(degree=2)

poly_transformer.fit(poly_features)

# Transform the features
poly_features = poly_transformer.transform(poly_features)
poly_features_test = poly_transformer.transform(poly_features_test)
print('Polynomial Features shape: ', poly_features.shape)

poly_features = pd.DataFrame(poly_features,
                             columns=poly_transformer.get_feature_names(
                                 ['板温', '现场温度', '光照强度', '风速', '风向']))

# Add in the target
poly_features['TARGET'] = poly_target

# Find the correlations with the target
poly_corrs = poly_features.corr()['TARGET'].sort_values()

# Display most negative and most positive
# print(poly_corrs)
# Put test features into dataframe
poly_features_test = pd.DataFrame(poly_features_test,
                                  columns=poly_transformer.get_feature_names(
                                      ['板温', '现场温度', '光照强度', '风速', '风向']))
scores_lr = []
errors_lr = []
print('[Linear Regression] running 10-fold cross-validation')
for train_indices, val_indices in kf.split(
        X, Y):  #split into training and test set

    X_train = X[train_indices]
    X_test = X[val_indices]
    Y_train = Y[train_indices]
    Y_test = Y[val_indices]

    #transform to format f(x,theta) = theta0 + theta1*x1 + theta2*x2 + theta3*x1*x2
    p = PolynomialFeatures(interaction_only=True, include_bias=False, degree=2)
    X_train = p.fit_transform(X_train)
    X_test = p.transform(X_test)  # reuse the transformer fitted on the training split
    poly_feature_names = p.get_feature_names()

    #train model
    lr.fit(X_train, Y_train)

    #model parameters
    beta_est = [lr.coef_[0], lr.intercept_]
    intercept = lr.intercept_
    coefficients = lr.coef_  #theta values
    assert (len(coefficients) == len(poly_feature_names))

    #predict values for testing set
    Y_est = lr.predict(X_test)

    #calculate Mean Square error and accuracy
    MSE = evaluate_predictions(y_true=Y_test, y_pred=Y_est)
Example #17
Pred1 = np.exp(Pred1)
Pred1[Pred1 > df_train['price'].max()] = df_train['price'].max()
Pred1[Pred1 < df_train['price'].min()] = df_train['price'].min()
print('Linear')
print('RMSE is:', round(np.sqrt(mse(y, Pred1)), 2))
print('R2 is:', round(r2_score(y, Pred1), 2))

plt.plot(y, Pred1, '.')
plt.scatter(y, Pred1)
plt.show()

##############################
#poly
poly = PolynomialFeatures(2)
X_train2 = poly.fit_transform(X_train1)
polynames = poly.get_feature_names(X_train1.columns)
X_train2 = pd.DataFrame(X_train2, columns=polynames)

reg2 = lm.LinearRegression()
y_log = np.log(y)
reg2.fit(X_train2, y_log)

Pred2 = reg2.predict(X_train2)
Pred2 = np.exp(Pred2)
Pred2[Pred2 > df_train['price'].max()] = df_train['price'].max()
Pred2[Pred2 < df_train['price'].min()] = df_train['price'].min()
print('Polynomial fit')
print('RMSE is:', round(np.sqrt(mse(y, Pred2)), 2))
print('R2 is:', round(r2_score(y, Pred2), 2))

plt.plot(y, Pred2, '.')
plt.xlabel("Univariate Regression Coefficients")
plt.ylabel("Multiple Regression Coefficients")
plt.grid()
plt.show()

# ## (f) Non-linear association between any of the predictors and the response.

# Model of the form y = &beta;<sub>0</sub> + &beta;<sub>1</sub>X + &beta;<sub>2</sub>X<sup>2</sup> + &beta;<sub>3</sub>X<sup>3</sup> + &epsilon;

beta_nl = []
poly = PolynomialFeatures(3)
for i in df_X.columns:
    print("Predictor-", i)
    col = df[i].values.reshape(-1, 1)
    X_poly = poly.fit_transform(col)
    df_poly = pd.DataFrame(X_poly, columns=poly.get_feature_names([i]))

    print("Dataframe with non-linear terms-")
    print(df_poly.head())
    model = sm.OLS(y, df_poly).fit()
    predictions_nl = model.predict(
        df_poly)  # make the predictions by the model

    # Print out the statistics
    stats_nl = model.summary().tables[1]
    print(stats_nl)
    print("The accuracy is {}%".format((model.rsquared) * 100))
    beta_nl.append(model.params[1])
    print("")
    print("")
Example #19
def poly(df, col_names):
    data = df[col_names]
    p = PolynomialFeatures(2).fit(data)
    features = pd.DataFrame(p.transform(data),
                            columns=p.get_feature_names(data.columns))
    return features
 #                          led_current="25 mA")
 # print(x.shape)
 # # pls_screen_as726x(x, y, n_comps=10)
 # print(type(x))
 poly = PolynomialFeatures(degree=1)
 x_trans = poly.fit_transform(x)
 # pls.fit(x_trans, y)
 # y_predict = pls.predict(x_trans)
 # print(mean_absolute_error(y, y_predict))
 # ham
 # n_comps = 6
 # regr = PLSRegression(n_components=n_comps)
 # print(x_trans.shape)
 # print(poly.get_feature_names())
 #
 x_trans = pd.DataFrame(x_trans, columns=poly.get_feature_names())
 print(x_trans)
 cols_to_use = []
 for column in poly.get_feature_names():
     if ' ' not in column:
         cols_to_use.append(column)
 print(cols_to_use)
 x_trans = x_trans[cols_to_use]
 print(x_trans)
 # svr = SVR()
 # pls = PLSRegression(n_components=6)
 # regr = pls
 # print(y.columns)
 # # pls.fit(x, y['Avg Total Chlorophyll (µg/cm2)'])
 # # print(pls.coef_)
 # # plot_learning_curve(pls, "", x_trans, y['Avg Total Chlorophyll (µg/cm2)'])
Example #21
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso

from math import sqrt

import pandas as pd
import numpy as np

File_Path = '../datasets/admission_data.csv'
admission_df = pd.read_csv(File_Path).drop('Serial No.', axis=1)

admission_df.head()

polynomial_transformer = PolynomialFeatures(6)
polynomial_features = polynomial_transformer.fit_transform(admission_df.values)
features = polynomial_transformer.get_feature_names(admission_df.columns)

label = admission_df[['Chance of Admit ']]

train_data, test_data, train_label, test_label = train_test_split(admission_df, label, test_size=0.3, random_state=5)

model = Lasso(alpha=0.001, max_iter=1000, normalize=True)
"""
alpha : 정규화 식에서의 람다 값 지정.
max_iter : 경사 하강법 반복 횟수 지정.
normalize : True로 지정 시, 자동으로 Feature Scaling 적용

이는 L2 정규화 모델 Ridge 에서도 동일하게 적용된다.
"""
model.fit(train_data, train_label)
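
# A short follow-up sketch: L1 regularization drives many coefficients to exactly
# zero, which is why Lasso is paired with a large polynomial expansion here.
print('non-zero coefficients:', np.count_nonzero(model.coef_), 'of', model.coef_.size)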
Example #22
plt.vlines(kb.bin_edges_[0], -3, 3, linewidth=1, alpha=.2)
plt.plot(X[:, 0], y, 'o', c='k')
plt.ylabel("Regression output")
plt.xlabel("Input feature")
plt.legend(loc="best")

from sklearn.preprocessing import PolynomialFeatures
# include polynomials up to x ** 10:
# the default "include_bias=True" adds a feature that's constantly 1
poly = PolynomialFeatures(degree=10, include_bias=False)
poly.fit(X)
X_poly = poly.transform(X)
print("X_poly.shape: {}".format(X_poly.shape))
print("Entries of X:\n{}".format(X[:5]))
print("Entries of X_poly:\n{}".format(X_poly[:5]))
print("Polynomial feature names:\n{}".format(poly.get_feature_names()))

reg = LinearRegression().fit(X_poly, y)
line_poly = poly.transform(line)
plt.plot(line, reg.predict(line_poly), label='polynomial linear regression')
plt.plot(X[:, 0], y, 'o', c='k')
plt.ylabel("Regression output")
plt.xlabel("Input feature")
plt.legend(loc="best")

from sklearn.svm import SVR
for gamma in [1, 10]:
    svr = SVR(gamma=gamma).fit(X, y)
    plt.plot(line, svr.predict(line), label='SVR gamma={}'.format(gamma))
plt.plot(X[:, 0], y, 'o', c='k')
plt.ylabel("Regression output")
insurance_df = pd.read_csv(INSURANCE_FILE_PATH)

# One-hot encode the required columns
insurance_df = pd.get_dummies(data=insurance_df,
                              columns=['sex', 'smoker', 'region'])

# Input (feature) data
X = insurance_df.drop(['charges'], axis=1)

# Create polynomial features
polynomial_transformer = PolynomialFeatures(4)  # define a degree-4 transformer
polynomial_features = polynomial_transformer.fit_transform(
    X.values)  # transform into degree-4 polynomial terms

# Generate the new feature names
features = polynomial_transformer.get_feature_names(X.columns)

# Polynomial input features
X = pd.DataFrame(polynomial_features, columns=features)

# Target variable
y = insurance_df[['charges']]

# Split into training and test data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=5)

# Train the Lasso model
model = Lasso(alpha=1, max_iter=2000, normalize=True)
Example #24
    def process_data(self):
        # Training data
        app_train = pd.read_csv('./Downloads/application_train.csv')
        # Testing data features
        app_test = pd.read_csv('./Downloads/application_test.csv')
        # Number of unique classes in each object column
        app_train.select_dtypes('object').apply(pd.Series.nunique, axis=0)

        # Create a label encoder object
        le = LabelEncoder()
        le_count = 0

        # Iterate through the columns
        for col in app_train:
            if app_train[col].dtype == 'object':
                # If 2 or fewer unique categories
                if len(list(app_train[col].unique())) <= 2:
                    # Train on the training data
                    le.fit(app_train[col])
                    # Transform both training and testing data
                    app_train[col] = le.transform(app_train[col])
                    app_test[col] = le.transform(app_test[col])

                    # Keep track of how many columns were label encoded
                    le_count += 1

        print('%d columns were label encoded.' % le_count)
        '''
        Encoding of categorical features falls into two cases:

        1. The category values have no ordinal meaning, e.g. color: [red, blue]; use one-hot encoding.

        2. The category values have an ordinal meaning, e.g. size: [X, XL, XXL]; use a numeric mapping such as {X: 1, XL: 2, XXL: 3}.

        get_dummies makes one-hot encoding of categorical features very convenient.

        '''
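
        # A tiny illustration of those two cases (the toy frame below is purely
        # illustrative and unrelated to this dataset):
        demo_df = pd.DataFrame({'color': ['red', 'blue'], 'size': ['X', 'XXL']})
        print(pd.get_dummies(demo_df['color']))                  # nominal -> one-hot columns
        print(demo_df['size'].map({'X': 1, 'XL': 2, 'XXL': 3}))  # ordinal -> numeric codes
        del demo_df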

        # one-hot encoding of categorical variables
        app_train = pd.get_dummies(app_train)
        app_test = pd.get_dummies(app_test)

        print('Training Features shape: ', app_train.shape)
        print('Testing Features shape: ', app_test.shape)

        # ### Aligning Training and Testing Data

        train_labels = app_train['TARGET']

        # Align the training and testing data, keep only columns present in both dataframes
        app_train, app_test = app_train.align(app_test, join='inner', axis=1)

        # Add the target back in
        app_train['TARGET'] = train_labels

        print('Training Features shape: ', app_train.shape)
        print('Testing Features shape: ', app_test.shape)

        anom = app_train[app_train['DAYS_EMPLOYED'] == 365243]
        non_anom = app_train[app_train['DAYS_EMPLOYED'] != 365243]
        print('The non-anomalies default on %0.2f%% of loans' %
              (100 * non_anom['TARGET'].mean()))
        print('The anomalies default on %0.2f%% of loans' %
              (100 * anom['TARGET'].mean()))
        print('There are %d anomalous days of employment' % len(anom))

        # Create an anomalous flag column
        app_train['DAYS_EMPLOYED_ANOM'] = app_train["DAYS_EMPLOYED"] == 365243

        # Replace the anomalous values with nan
        app_train['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace=True)

        app_train['DAYS_EMPLOYED'].plot.hist(title='Days Employment Histogram')

        app_test['DAYS_EMPLOYED_ANOM'] = app_test["DAYS_EMPLOYED"] == 365243
        app_test["DAYS_EMPLOYED"].replace({365243: np.nan}, inplace=True)

        print('There are %d anomalies in the test data out of %d entries' %
              (app_test["DAYS_EMPLOYED_ANOM"].sum(), len(app_test)))

        correlations = app_train.corr()['TARGET'].sort_values()

        # Display correlations
        print('Most Positive Correlations:\n', correlations.tail(15))
        print('\nMost Negative Correlations:\n', correlations.head(15))

        # Find the correlation of the positive days since birth and target
        app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])

        app_train.to_csv('./data/app_train.csv', index=False)
        app_test.to_csv('./data/app_test.csv', index=False)

        # Make a new dataframe for polynomial features
        poly_features = app_train[[
            'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH',
            'TARGET'
        ]]
        poly_features_test = app_test[[
            'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'
        ]]

        imputer = Imputer(strategy='median')

        poly_target = poly_features['TARGET']

        poly_features = poly_features.drop(columns=['TARGET'])

        # Need to impute missing values
        poly_features = imputer.fit_transform(poly_features)
        poly_features_test = imputer.transform(poly_features_test)

        # Create the polynomial object with specified degree
        poly_transformer = PolynomialFeatures(degree=3)

        # Train the polynomial features
        poly_transformer.fit(poly_features)

        # Transform the features
        poly_features = poly_transformer.transform(poly_features)
        poly_features_test = poly_transformer.transform(poly_features_test)
        print('Polynomial Features shape: ', poly_features.shape)

        poly_features = pd.DataFrame(
            poly_features,
            columns=poly_transformer.get_feature_names(
                ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
                 'DAYS_BIRTH']))

        # Add in the target
        poly_features['TARGET'] = poly_target

        # Put test features into dataframe
        poly_features_test = pd.DataFrame(
            poly_features_test,
            columns=poly_transformer.get_feature_names(
                ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
                 'DAYS_BIRTH']))

        poly_features.to_csv('./data/poly_features.csv', index=False)
        poly_features_test.to_csv('./data/poly_features_test.csv', index=False)
        # Merge polynomial features into training dataframe
        poly_features['SK_ID_CURR'] = app_train['SK_ID_CURR']
        app_train_poly = app_train.merge(poly_features,
                                         on='SK_ID_CURR',
                                         how='left')

        # Merge polynomial features into the testing dataframe
        poly_features_test['SK_ID_CURR'] = app_test['SK_ID_CURR']
        app_test_poly = app_test.merge(poly_features_test,
                                       on='SK_ID_CURR',
                                       how='left')

        # Align the dataframes
        app_train_poly, app_test_poly = app_train_poly.align(app_test_poly,
                                                             join='inner',
                                                             axis=1)

        app_train_poly['TARGET'] = poly_target
        # Print out the new shapes
        print('Training data with polynomial features shape: ',
              app_train_poly.shape)
        print('Testing data with polynomial features shape:  ',
              app_test_poly.shape)

        app_train_poly.to_csv('./data/app_train_poly.csv', index=False)
        app_test_poly.to_csv('./data/app_test_poly.csv', index=False)

        app_train_domain = app_train.copy()
        app_test_domain = app_test.copy()

        app_train_domain['CREDIT_INCOME_PERCENT'] = app_train_domain[
            'AMT_CREDIT'] / app_train_domain['AMT_INCOME_TOTAL']
        app_train_domain['ANNUITY_INCOME_PERCENT'] = app_train_domain[
            'AMT_ANNUITY'] / app_train_domain['AMT_INCOME_TOTAL']
        app_train_domain['CREDIT_TERM'] = app_train_domain[
            'AMT_ANNUITY'] / app_train_domain['AMT_CREDIT']
        app_train_domain['DAYS_EMPLOYED_PERCENT'] = app_train_domain[
            'DAYS_EMPLOYED'] / app_train_domain['DAYS_BIRTH']

        app_test_domain['CREDIT_INCOME_PERCENT'] = app_test_domain[
            'AMT_CREDIT'] / app_test_domain['AMT_INCOME_TOTAL']
        app_test_domain['ANNUITY_INCOME_PERCENT'] = app_test_domain[
            'AMT_ANNUITY'] / app_test_domain['AMT_INCOME_TOTAL']
        app_test_domain['CREDIT_TERM'] = app_test_domain[
            'AMT_ANNUITY'] / app_test_domain['AMT_CREDIT']
        app_test_domain['DAYS_EMPLOYED_PERCENT'] = app_test_domain[
            'DAYS_EMPLOYED'] / app_test_domain['DAYS_BIRTH']

        app_train_domain.to_csv('./data/app_train_domain.csv', index=False)
        app_test_domain.to_csv('./data/app_test_domain.csv', index=False)
Exemple #25
0
print(train.describe())
train = train.drop(['TenantHasSubscription'], axis=1)
test = test.drop(['TenantHasSubscription'], axis=1)
colnames = train.columns.values
train.describe().to_csv(data_dir + 'data-description.csv')
print('going through the columns to find out if they have missing value:')
for n in colnames:
    if any(pd.isna(train[n])):
        print(n)

# adding interaction terms:
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
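# With interaction_only=True and include_bias=False, n input columns expand to
# n + n*(n-1)/2 output columns (the originals plus all pairwise products).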
train_transformed = poly.fit_transform(train[colnames[1:-1]])
print('Shape of train after adding interactions= ', train_transformed.shape)

feat_names = poly.get_feature_names(colnames[1:-1])
feat_names = [
    '-'.join(name.split()) if len(name.split()) > 1 else name
    for name in feat_names
]

train_transformed = pd.DataFrame(train_transformed, columns=feat_names)
train_transformed['OMSTenantId'] = train['OMSTenantId']
train_transformed['Label'] = train['Label']
train_transformed = train_transformed[['OMSTenantId'] + feat_names + ['Label']]

poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
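# Re-fitting on the test set works here because PolynomialFeatures.fit only records the
# number of input columns, but reusing the transformer fitted on train (poly.transform)
# would be the more conventional pattern.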
test_transformed = poly.fit_transform(test[colnames[1:-1]])
print('Shape of test after adding interactions= ', test_transformed.shape)

feat_names = poly.get_feature_names(colnames[1:-1])
Exemple #26
0
def clean_data(dataframe):

    survival = dataframe.pop('Survived')  # Pop survived column for now.
    dataframe['Sex'] = dataframe['Sex'].map({
        'female': 1,
        'male': 0
    })  # Turn gender into binary value.
    temp_embarked = pd.get_dummies(
        dataframe['Embarked']
    )  # Split port of embarkation into different feature sets.
    dataframe = pd.concat([dataframe, temp_embarked], axis=1)

    # Replace NaN age values with avg age of their Pclass.
    p1_avg = dataframe[dataframe['Pclass'] == 1]['Age'].mean()
    p2_avg = dataframe[dataframe['Pclass'] == 2]['Age'].mean()
    p3_avg = dataframe[dataframe['Pclass'] == 3]['Age'].mean()
    dataframe.loc[dataframe['Pclass'] == 1,
                  'Age'] = dataframe.loc[dataframe['Pclass'] == 1,
                                         'Age'].fillna(p1_avg)
    dataframe.loc[dataframe['Pclass'] == 2,
                  'Age'] = dataframe.loc[dataframe['Pclass'] == 2,
                                         'Age'].fillna(p2_avg)
    dataframe.loc[dataframe['Pclass'] == 3,
                  'Age'] = dataframe.loc[dataframe['Pclass'] == 3,
                                         'Age'].fillna(p3_avg)

    # Add adult/child data.
    dataframe['Adult'] = [1 if age >= 18 else 0 for age in dataframe['Age']]
    dataframe['Old'] = [
        1 if age >= (dataframe['Age'].mean()) else 0
        for age in dataframe['Age']
    ]

    # Fill in missing Fare values w/ column mean, get FarePerPerson.
    dataframe['Fare'] = dataframe['Fare'].fillna(dataframe['Fare'].mean())
    dataframe['FarePerPerson'] = dataframe['Fare'] / (dataframe['SibSp'] +
                                                      dataframe['Parch'] + 1)

    # Get titles from Name field.
    unique_titles = list(
        set(
            re.search(r'^[^,]+, ([\w\s]+)\.', name).group(1)
            for name in dataframe['Name']))
    unique_titles.sort()
    title_dict = {}
    for i, name in enumerate(unique_titles):
        title_dict[name] = i
    dataframe['Title'] = [
        title_dict[re.search(r'^[^,]+, ([\w\s]+)\.', name).group(1)]
        for name in dataframe['Name']
    ]

    # Get surnames from Name field, exclude adult men.
    all_surnames = [name.split(',')[0] for name in dataframe['Name']]
    unique_surnames = list(set(all_surnames))
    unique_surnames.sort()
    surname_dict = {}
    for i, name in enumerate(unique_surnames):
        surname_dict[name] = [i + 1, 0]
    dataframe['SurnameWomenChildren'] = [
        surname_dict[name][0] for name in all_surnames
    ]
    dataframe.loc[(dataframe['Adult'] == 1) & (dataframe['Sex'] == 0),
                  'SurnameWomenChildren'] = 0

    # Identify single passengers.
    for name in all_surnames:
        surname_dict[name] = [surname_dict[name][0], surname_dict[name][1] + 1]
    single_dict = {}
    for name in surname_dict:
        if surname_dict[name][1] == 1:
            single_dict[name] = 1
        else:
            single_dict[name] = 0
    dataframe['Single'] = [single_dict[name] for name in all_surnames]

    # Add explicit 1/0 to average family survival for women/child groups.
    dataframe['WomenChildGroupSurvival'] = -3
    dataframe['Surname'] = all_surnames
    dataframe = pd.concat([survival, dataframe], axis=1)
    for name in unique_surnames:
        avg_survival = dataframe.loc[(dataframe['Surname'] == name)
                                     & ~((dataframe['Adult'] == 1) &
                                         (dataframe['Sex'] == 0)),
                                     'Survived'].dropna().mean()
        if pd.isnull(avg_survival):
            avg_survival = 0.0
        dataframe.loc[(dataframe['Surname'] == name)
                      & ~((dataframe['Adult'] == 1) & (dataframe['Sex'] == 0)),
                      'WomenChildGroupSurvival'] = avg_survival
    dataframe.loc[dataframe['Single'] == 1,
                  'WomenChildGroupSurvival'] = -1  # Single passengers.
    dataframe.loc[(dataframe['Adult'] == 1) & (dataframe['Sex'] == 0),
                  'WomenChildGroupSurvival'] = -2  # Adult men.
    dataframe = dataframe.drop(['Survived', 'Surname'], axis=1)

    # Cabin data.
    # dataframe['Cabin'] = [1 if len(name)>0 else 0 for name in dataframe['Cabin'].fillna('')]
    # dataframe['Cabin'] = [(ord(c[0].lower()) - 96) for c in dataframe['Cabin'].fillna('U')]

    # Remove data items from frame that are not numerical.
    dataframe = dataframe.drop(['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1)

    # Add polynomial features.
    pass_id = dataframe.pop('PassengerId')
    poly = PolynomialFeatures(2)
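    # PolynomialFeatures(2) keeps include_bias=True, so a constant '1' column is added;
    # it stays constant through scaling and is removed by the constant-column drop below.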
    temp = poly.fit_transform(dataframe)
    poly_header = poly.get_feature_names(dataframe.columns)
    dataframe = pd.DataFrame(data=temp,
                             index=dataframe.index,
                             columns=poly_header)
    dataframe = pd.concat([pass_id, dataframe], axis=1)

    # Perform feature scaling.
    scaler = StandardScaler()
    pass_id = dataframe.pop('PassengerId')
    dataframe[dataframe.columns] = scaler.fit_transform(
        dataframe[dataframe.columns])
    dataframe = pd.concat([pass_id, dataframe], axis=1)

    # Drop features that have the same value for all rows.
    cols = list(dataframe)
    nunique = dataframe.apply(pd.Series.nunique)
    cols_to_drop = nunique[nunique == 1].index
    dataframe = dataframe.drop(cols_to_drop, axis=1)

    # Add survival back in.
    dataframe = pd.concat([survival, dataframe], axis=1)

    # Return data.
    return (dataframe)
Exemple #27
0
# Section 2: Get fake data in correct format
X = la.transpose([X])
Y = la.transpose([Y])

# Section 3: Pure Python Tools Fit
poly_pp = ml.Poly_Features_Pure_Py(order=2)
Xp = poly_pp.fit_transform(X)
print('PP Feature Names:', poly_pp.get_feature_names())
ls_pp = ml.Least_Squares(tol=2, add_ones_column=False)
ls_pp.fit(Xp, Y)
print()

# Section 4: SciKit Learn Fit
poly_sk = PolynomialFeatures(degree=2)
X_poly = poly_sk.fit_transform(X)
print('SK Feature Names:', poly_sk.get_feature_names())
ls_sk = LinearRegression()
ls_sk.fit(X_poly, Y)
print()

# Section 5: Coefficients Comparison
tmp_ls_pp_coefs = sorted(ls_pp.coefs)
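# round to 8 decimal places; adding 0 normalizes any -0.0 values to 0.0 so the two
# coefficient lists can be compared directly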
rounded_ls_pp_coefs = [
    round(x, 8) + 0 for x in la.transpose(tmp_ls_pp_coefs)[0]
]
print('PurePy  LS coefficients:', rounded_ls_pp_coefs)

tmp_ls_sk_coefs = ls_sk.intercept_.tolist() + ls_sk.coef_[0][1:].tolist()
tmp_ls_sk_coefs = sorted(tmp_ls_sk_coefs)
rounded_ls_sk_coefs = [round(x, 8) + 0 for x in tmp_ls_sk_coefs]
print('SKLearn LS coefficients:', rounded_ls_sk_coefs, '\n')
# In[84]:


from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2).fit(X_train_transformed)
X_train_poly = poly.transform(X_train_transformed_scaled)
X_test_poly = poly.transform(X_test_transformed_scaled)
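# Note: the transformer above is fitted on X_train_transformed but applied to the scaled
# matrices; PolynomialFeatures.fit only records the number of input columns, so this works
# as long as the column counts match, though fitting on the scaled data would be clearer.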


# In[68]:


# Debug
print(poly.get_feature_names())


# In[77]:


from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

## Get score using original model
logreg = LogisticRegression(C=1)
logreg.fit(X_train, y_train)
scores = cross_val_score(logreg, X_train, y_train, cv=10)
print('CV accuracy (original): %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
highest_score = np.mean(scores)
if DIVISION:
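    # add reciprocal (1/x) features; any column that produces inf (division by zero) is dropped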
    X_recip = (1 / X).copy()
    X_recip.columns = ["recip_" + str(c) for c in X_recip.columns]
    X_recip = X_recip.replace([np.inf, -np.inf], np.nan).dropna(axis=1)
    X = pd.concat([X, X_recip], axis=1)
    del X_recip

# determine if we are building a classifier model
classifier = set(np.unique(Y.to_numpy())) == {0, 1}  # robust even if the target is not strictly binary
outputs = Y.shape[1]

# add 2nd order polynomial features to X
poly = PolynomialFeatures(2, include_bias=False)
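# include_bias=False avoids adding a constant '1' column that would otherwise be carried
# into the generated feature names and the RFE selection below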
x_columns = X.columns
X = pd.DataFrame(poly.fit_transform(X))
X.columns = poly.get_feature_names(x_columns)

# set up the model
if classifier:
    selector = RFE(RandomForestClassifier(n_estimators=50,
                                          max_depth=14,
                                          min_samples_leaf=5,
                                          max_features="sqrt",
                                          random_state=42,
                                          class_weight="balanced_subsample",
                                          n_jobs=1),
                   step=0.05,
                   verbose=1)
else:
    selector = RFE(RandomForestRegressor(n_estimators=50,
                                         max_depth=14,
Exemple #30
0
    print(features)

    poly = PolynomialFeatures(poly_degree)

    Y = np.array(y_id)
    reg_u = np.full(X_deg, avg_u_id)
    reg_y = np.full(AR_deg, avg_y_id)
    PHI = []
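    # reg_u / reg_y act as sliding windows over the last X_deg inputs and AR_deg outputs:
    # each step appends the newest sample, drops the oldest ([1:]), and the polynomial
    # expansion of the combined regressor vector becomes one row of PHI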
    for i in tqdm(range(ID_LENGHT)):
        if i != 0:
            reg_y = np.append(reg_y, Y[i])[1:]
            reg_u = np.append(reg_u, u_id[i])[1:]
        regressors = np.append(reg_u, reg_y)
        PHI.append(poly.fit_transform([regressors])[0])
    PHI = np.array(PHI)
    regressor_terms = poly.get_feature_names(features)
    print("Regressors: ", regressor_terms)

    #FROE 2
    poly = PolynomialFeatures(poly_degree)

    Y_val = np.array(y_val)
    reg_u = np.full(X_deg, avg_u_val)
    reg_y = np.full(AR_deg, avg_y_val)
    PHI_val = []
    for i in tqdm(range(VAL_LENGHT)):
        if i != 0:
            reg_y = np.append(reg_y, Y_val[i])[1:]
            reg_u = np.append(reg_u, u_val[i])[1:]
        regressors = np.append(reg_u, reg_y)
        PHI_val.append(poly.fit_transform([regressors])[0])
Exemple #31
-1
def analysis_7(df_Coredata):
	""" 多次元多項式モデル """

	#https://www.jeremyjordan.me/polynomial-regression/

	X = df_Coredata[['d','e','f','g','i']]
	y = df_Coredata['j']

	# Set the plot style
	sns.set(style = 'whitegrid', context = 'notebook')
	# Plot the pairwise relationships between variables
	#sns.pairplot(df_Coredata)
	#plt.show()


	#X_train, X_test, y_train, y_test  =  train_test_split(X,y,random_state = 0)
	#lr = linear_model.LinearRegression().fit(X_train, y_train)
	#print("Trainng set score: {:.2f}".format(lr.score(X_train, y_train)))
	#print("Test set score: {:.2f}".format(lr.score(X_test, y_test)))

	### Feature scaling
	# Standardization
	std_Scaler = StandardScaler()
	data_std = std_Scaler.fit_transform(X)

	mmx_Scaler = MinMaxScaler()
	X_scaled = mmx_Scaler.fit_transform(X)
	#X_test_scaled = scaler.transform(X_test)

	#print(X_train_scaled)

	poly = PolynomialFeatures(degree = 2).fit(data_std)
	print(poly.get_feature_names())
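	# with no input_features argument, the generated names default to x0, x1, ...,
	# one per column of data_std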