# Assumes module-level imports: pandas as pd, and StandardScaler / LabelEncoder
# from sklearn.preprocessing.
def preparation_data(self, df):
    # Standardize the numeric columns
    kolom_numerik = ['Umur', 'NilaiBelanjaSetahun']

    # Statistics before standardization
    print('Statistics before standardization\n')
    print(df[kolom_numerik].describe().round(1))

    # Standardization
    df_std = StandardScaler().fit_transform(df[kolom_numerik])

    # Wrap the result back into a DataFrame
    df_std = pd.DataFrame(data=df_std, index=df.index,
                          columns=df[kolom_numerik].columns)

    # Show a sample of the data and summary statistics
    print('Sample of the standardized data\n')
    print(df_std.head())
    print('Statistics after standardization\n')
    print(df_std.describe().round(0))

    # Convert the categorical data
    # Names of the categorical columns
    kolom_kategorikal = ['Jenis Kelamin', 'Profesi', 'Tipe Residen']

    # Copy the categorical columns into a new frame
    df_encode = df[kolom_kategorikal].copy()

    # Apply LabelEncoder to every categorical column
    for col in kolom_kategorikal:
        df_encode[col] = LabelEncoder().fit_transform(df_encode[col])

    # Show the encoded data
    print(df_encode.head())

    # Join the encoded and standardized frames on the index
    df_model = df_encode.merge(df_std, left_index=True, right_index=True,
                               how='left')
    print(df_model.head())
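# A minimal illustration (toy values, not from the dataset) of what
# StandardScaler does: subtract the column mean and divide by the column
# standard deviation, so the output has mean ~0 and std ~1.
import numpy as np
from sklearn.preprocessing import StandardScaler

toy = np.array([[20.0], [30.0], [40.0]])
print(StandardScaler().fit_transform(toy).ravel())  # approx [-1.22, 0.00, 1.22]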
# Statistics before standardization
print('Statistics before standardization\n')
print(df[kolom_numerik].describe().round(1))

# Standardization
df_std = StandardScaler().fit_transform(df[kolom_numerik])

# Wrap the result back into a DataFrame
df_std = pd.DataFrame(data=df_std, index=df.index,
                      columns=df[kolom_numerik].columns)

# Show a sample of the data and summary statistics
print('Sample of the standardized data\n')
print(df_std.head())
print('Statistics after standardization\n')
print(df_std.describe().round(0))


# # [Converting Categorical Data with Label Encoder](https://academy.dqlab.id/main/livecode/293/562/2809)

# In[10]:


from sklearn.preprocessing import LabelEncoder

# Names of the categorical columns
kolom_kategorikal = ['Jenis Kelamin', 'Profesi', 'Tipe Residen']

# Copy the categorical columns into a new frame
df_encode = df[kolom_kategorikal].copy()
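# A tiny usage sketch (toy categories, not from the dataset): LabelEncoder
# assigns each distinct category an integer, ordered by the sorted names.
le = LabelEncoder()
print(le.fit_transform(['Wanita', 'Pria', 'Wanita']))  # [1 0 1]
print(le.classes_)                                     # ['Pria' 'Wanita']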
# Coerce every raw price value to float; entries that fail to parse become 0.
# (The snippet begins mid-loop in the source; the enclosing `for` headers and
# the `try:` line are reconstructed here so the block runs.)
for nf in day_features:
    rows = []
    for element in df_price[nf]:
        try:
            element = float(element)
        except ValueError:
            # print("error happens!")
            element = 0
        rows.append(element)
    df_price[nf] = rows

df_price = df_price.loc[:, day_features].replace(np.nan, 0).values

# Standardizing the features
df_price = StandardScaler().fit_transform(df_price)

# Restore the column names
df_price = pd.DataFrame(df_price, columns=day_features)
dataset = df_price.values
print("price feature sample: ")
print(df_price.head())

# PAA transformation
# PAA transform (and inverse transform) of the data
n_paa_segments = 3
paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
paa_list = []
for item in df_price.values:
    item = item.reshape((1, 5, 1))
    paa_price_inv = paa.inverse_transform(paa.fit_transform(item))
    paa_list.append(paa_price_inv)
paa_array = np.array(paa_list)
paa_data = paa_array.reshape(1904, 5)
paa_df = pd.DataFrame(paa_data, columns=day_features)
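# A toy illustration of the PAA step above (assumes tslearn's
# PiecewiseAggregateApproximation API): a length-6 series split into 3
# segments is reduced to the 3 segment means, and inverse_transform stretches
# those means back out to the original length.
import numpy as np
from tslearn.piecewise import PiecewiseAggregateApproximation

series = np.array([1.0, 3.0, 5.0, 7.0, 9.0, 11.0]).reshape((1, 6, 1))
toy_paa = PiecewiseAggregateApproximation(n_segments=3)
print(toy_paa.fit_transform(series).ravel())   # [ 2.  6. 10.]
print(toy_paa.inverse_transform(toy_paa.fit_transform(series)).ravel())
# [ 2.  2.  6.  6. 10. 10.]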
df_ss = StandardScaler().fit_transform(df)


# In[15]:


# Create a dataframe and restore the column names lost by fit_transform
df_ss = pd.DataFrame(df_ss)
df_ss.columns = [
    'Sports', 'SUV', 'Wagon', 'Minivan', 'Pickup', 'AWD', 'RWD',
    'Retail', 'Dealer', 'Engine', 'Cylinders', 'Horsepower',
    'CityMPG', 'HighwayMPG', 'Weight', 'Wheelbase', 'Length', 'Width'
]


# In[16]:


# Check first few rows
df_ss.head()


# In[17]:


df_ss.describe()


# In[18]:


# Reduce the features down to 3 principal components
comp = 3
pca, X_pca = do_pca(comp, df_ss)


# In[19]:


X_pca
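# The do_pca helper called above is not defined in this snippet; here is a
# plausible sketch (an assumption, not the author's actual code) that matches
# its call signature and return values:
from sklearn.decomposition import PCA

def do_pca(n_components, data):
    """Fit a PCA with n_components and return (fitted_pca, transformed_data)."""
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(data)
    return pca, X_pca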
        # Continuation of the cluster-growing loop (the snippet starts
        # mid-function; the `if` branch is reconstructed from the `elif`):
        # a noise point (-1) is claimed by the current cluster C, and an
        # unvisited point (0) is claimed and its neighborhood queued.
        if labels[Pn] == -1:
            labels[Pn] = C
        elif labels[Pn] == 0:
            labels[Pn] = C
            PnNeighborPts = regionQuery(D, Pn, eps)
            if len(PnNeighborPts) >= MinPts:
                NeighborPts = NeighborPts + PnNeighborPts
        i += 1


def regionQuery(D, P, eps):
    # Return all points within Euclidean distance eps of point P
    neighbors = []
    for Pn in range(0, len(D)):
        if numpy.linalg.norm(D[P] - D[Pn]) < eps:
            neighbors.append(Pn)
    return neighbors


import numpy
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

X = pd.read_csv("clustering.csv")
X = StandardScaler().fit_transform(X)
MyDBSCAN(X, 0.3, 5)  # MyDBSCAN is defined earlier in the file

headers = ["x", "y"]
X = pd.read_csv("clustering.csv", names=headers)
X.head()
plt.scatter(X.x, X.y)
plt.show()
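# A hedged cross-check (not in the original): sklearn's DBSCAN with the same
# eps / min_samples should recover the same partition. Note that sklearn
# numbers clusters from 0 and marks noise as -1, while tutorial-style
# implementations like the one above typically number clusters from 1, so
# compare groupings rather than raw label values.
from sklearn.cluster import DBSCAN

X_std = StandardScaler().fit_transform(pd.read_csv("clustering.csv"))
ref_labels = DBSCAN(eps=0.3, min_samples=5).fit_predict(X_std)
print(ref_labels[:10])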
- PCA is a dimensionality reduction technique that discards less-informative 'noise' features.
- But PCA is sensitive to variance and to features on different scales, so standardizing first helps it perform better.
- However, since the correlations between features in the training dataset are not that significant, PCA might not be meaningful here.
"""

from sklearn.preprocessing import StandardScaler

# Standardize the training data (zero mean, unit variance; note this does not
# bound values to [-1, +1])
standardized_train = StandardScaler().fit_transform(train.set_index(['ID_code', 'target']))
standardized_train

standardized_train = pd.DataFrame(standardized_train,
                                  columns=train.set_index(['ID_code', 'target']).columns)
standardized_train = train[['ID_code', 'target']].join(standardized_train)
standardized_train.head(10)

from sklearn.decomposition import PCA

k = 80
pca = PCA(n_components=k, random_state=42, whiten=True)
pca.fit(standardized_train.set_index(['ID_code', 'target']))
sum(pca.explained_variance_ratio_)

plt.figure(figsize=(26, 9))
plt.plot(pca.explained_variance_ratio_)
plt.xticks(range(k))
plt.xlabel("Number of Features")
plt.ylabel("Proportion of variance explained by additional feature")

"""Normally, if there is an elbow-looking point in the graph above, its x value
(the number of features) is usually the ideal number of components for PCA."""
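# A small follow-up sketch (not in the original): when no clear elbow shows
# up, a common fallback is the smallest k whose cumulative explained variance
# crosses a threshold (0.95 here is an arbitrary choice).
import numpy as np

cumulative = np.cumsum(pca.explained_variance_ratio_)
if cumulative[-1] >= 0.95:
    k_95 = int(np.argmax(cumulative >= 0.95)) + 1
    print("components needed for 95% of the variance:", k_95)
else:
    print("the first", k, "components explain only", cumulative[-1])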
classifier = RandomForestClassifier(max_depth=2, random_state=0)
classifier.fit(x_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(x_test)

# Performance evaluation
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)


###### Predicting Customer Lifetime Value using Regression

df2 = pd.get_dummies(df)
df2.head()

x = df2.drop(columns='customer lifetime value')
x.head()
y = df2['customer lifetime value']
y.head()

# OLS Regression
x_1 = sm.add_constant(x)
model = sm.OLS(y, x_1).fit()
model.pvalues
model.summary()
# R^2 is 0.167

# Use backward elimination: repeatedly drop the feature with the highest
# p-value until every remaining p-value is below 0.05.
# (The source cuts off after the first two loop lines; the rest of the body is
# reconstructed here as the standard pattern, so treat it as an assumption.)
cols = list(x.columns)
pmax = 1
while len(cols) > 0:
    p = []
    x_1 = x[cols]
    x_1 = sm.add_constant(x_1)
    model = sm.OLS(y, x_1).fit()
    p = pd.Series(model.pvalues.values[1:], index=cols)
    pmax = max(p)
    if pmax > 0.05:
        cols.remove(p.idxmax())
    else:
        break
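# Hypothetical follow-up (the `selected` name is illustrative, not from the
# source): refit OLS on the columns that survived elimination and read off the
# final R^2.
selected = cols
final_model = sm.OLS(y, sm.add_constant(x[selected])).fit()
print(final_model.rsquared)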
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.preprocessing import StandardScaler
# sklearn.cross_validation was removed; train_test_split now lives in model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

df = pd.read_csv('KNN_Project.csv')
print(df.head())

sb.pairplot(df, hue='TARGET CLASS')
plt.show()

# Fit the scaler on the feature columns, then transform them
scaler = StandardScaler()
scaler.fit(df.drop('TARGET CLASS', axis=1))
scaled_features = scaler.transform(df.drop('TARGET CLASS', axis=1))
scaled_features = pd.DataFrame(scaled_features, columns=df.columns[:-1])
print(scaled_features.head())

X = scaled_features
y = df['TARGET CLASS']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=101)

# Sweep k from 1 to 39 and record the misclassification rate for each
arr = []
for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred = knn.predict(X_test)
    arr.append(np.mean(pred != y_test))

plt.figure(figsize=(12, 5))
plt.plot(range(1, 40), arr)  # styling arguments truncated in the source; call closed minimally
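# A short follow-up sketch (not in the original): pick the k with the lowest
# error rate from the sweep above and report its metrics.
best_k = int(np.argmin(arr)) + 1
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))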
# <h4>Feature scaling</h4>
# <p>We will move from basic to more advanced models, starting with logistic
# regression. Since logistic regression is a distance-based method, we scale
# our features first.</p>

# In[24]:


# Scale the features, since we will also use distance-based models such as
# logistic regression
df_train_custom = df_train.drop(["target", "ID_code"], axis=1)
df_train_custom = StandardScaler().fit_transform(df_train_custom)
df_train_custom = pd.DataFrame(df_train_custom)
df_train_custom = df_train[['ID_code', 'target']].join(df_train_custom)


# In[25]:


df_train_custom.head()


# <h4>Feature Importance</h4>

# In[26]:


# Compute feature importances to get insight into the features:
# run a random forest with GridSearchCV for hyperparameter tuning
parameters = {'min_samples_leaf': [20, 25]}
forest = RandomForestClassifier(max_depth=15, n_estimators=15)
grid = GridSearchCV(forest, parameters, cv=3, n_jobs=-1, verbose=2,
                    scoring=make_scorer(roc_auc_score))
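# Hypothetical continuation (an assumption: the source cuts off before the
# grid search is fitted): fit the grid on the scaled features, then rank the
# best forest's impurity-based feature importances.
features = df_train_custom.drop(columns=['ID_code', 'target'])
grid.fit(features, df_train_custom['target'])
importances = pd.Series(grid.best_estimator_.feature_importances_,
                        index=features.columns).sort_values(ascending=False)
print(importances.head(10))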
# In[177]:


fraud_data.TransactionAmt.max()


# In[183]:


# Standardization / Normalization
from sklearn.preprocessing import StandardScaler

scaled_features = StandardScaler().fit_transform(X)
scaled_features = pd.DataFrame(data=scaled_features)
scaled_features.columns = X.columns
scaled_features.head()


# In[187]:


# Splitting the data
from sklearn.model_selection import train_test_split

# Note: this split uses the raw X; pass scaled_features instead if the
# standardized matrix from the previous cell is the intended input.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,
                                                    random_state=42)
# X_train: independent feature data for training the model
# Y_train: dependent feature data for training the model
# X_test: independent feature data for testing the model; used to predict the target values
# Y_test: original target values of X_test; we will compare these with our predicted values
# test_size = 0.3: 30% of the data goes to the test set and 70% to the train set
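# A hedged alternative (an assumption, not in the original): fraud labels are
# typically highly imbalanced, so a stratified split keeps the class ratio
# identical in the train and test sets.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42, stratify=Y
)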